diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md
index 7343bf24f..fe69e7ef0 100644
--- a/docs/algorithms/kem/ml_kem.md
+++ b/docs/algorithms/kem/ml_kem.md
@@ -7,9 +7,9 @@
- **Authors' website**: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203
- **Specification version**: ML-KEM.
- **Primary Source**:
- - **Source**: https://github.com/pq-code-package/mlkem-native/commit/84398e7230fa31ba4241f5eb36bdc3c1dbbd5bcd with copy_from_upstream patches
+ - **Source**: https://github.com/pq-code-package/mlkem-native/commit/d830bc22eb1613bbe38028cfefc33f1a52a40b2f with copy_from_upstream patches
- **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0
-- **Optimized Implementation sources**: https://github.com/pq-code-package/mlkem-native/commit/84398e7230fa31ba4241f5eb36bdc3c1dbbd5bcd with copy_from_upstream patches
+- **Optimized Implementation sources**: https://github.com/pq-code-package/mlkem-native/commit/d830bc22eb1613bbe38028cfefc33f1a52a40b2f with copy_from_upstream patches
- **cupqc-cuda**:
- **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e
- **Implementation license (SPDX-Identifier)**: Apache-2.0
diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml
index 62400591b..4042f6f60 100644
--- a/docs/algorithms/kem/ml_kem.yml
+++ b/docs/algorithms/kem/ml_kem.yml
@@ -17,7 +17,7 @@ website: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203
nist-round: FIPS203
spec-version: ML-KEM
primary-upstream:
- source: https://github.com/pq-code-package/mlkem-native/commit/84398e7230fa31ba4241f5eb36bdc3c1dbbd5bcd
+ source: https://github.com/pq-code-package/mlkem-native/commit/d830bc22eb1613bbe38028cfefc33f1a52a40b2f
with copy_from_upstream patches
spdx-license-identifier: CC0-1.0 or Apache-2.0
optimized-upstreams:
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index bc08de94f..54fc50307 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -34,7 +34,7 @@ upstreams:
name: mlkem-native
git_url: https://github.com/pq-code-package/mlkem-native.git
git_branch: main
- git_commit: 84398e7230fa31ba4241f5eb36bdc3c1dbbd5bcd
+ git_commit: d830bc22eb1613bbe38028cfefc33f1a52a40b2f
kem_meta_path: 'integration/liboqs/{pretty_name_full}_META.yml'
kem_scheme_path: '.'
patches: [mlkem-native-ml_kem.patch]
diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt
index aabc775fa..bd2201513 100644
--- a/src/kem/ml_kem/CMakeLists.txt
+++ b/src/kem/ml_kem/CMakeLists.txt
@@ -15,7 +15,7 @@ if(OQS_ENABLE_KEM_ml_kem_512)
endif()
if(OQS_ENABLE_KEM_ml_kem_512_x86_64)
- add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/compress.c mlkem-native_ml-kem-512_x86_64/debug.c mlkem-native_ml-kem-512_x86_64/indcpa.c mlkem-native_ml-kem-512_x86_64/kem.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-512_x86_64/poly.c mlkem-native_ml-kem-512_x86_64/poly_k.c mlkem-native_ml-kem-512_x86_64/sampling.c mlkem-native_ml-kem-512_x86_64/verify.c mlkem-native_ml-kem-512_x86_64/zetas.c)
+ add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/compress.c mlkem-native_ml-kem-512_x86_64/debug.c mlkem-native_ml-kem-512_x86_64/indcpa.c mlkem-native_ml-kem-512_x86_64/kem.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttpack.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/reduce.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/tomont.S mlkem-native_ml-kem-512_x86_64/poly.c mlkem-native_ml-kem-512_x86_64/poly_k.c mlkem-native_ml-kem-512_x86_64/sampling.c mlkem-native_ml-kem-512_x86_64/verify.c mlkem-native_ml-kem-512_x86_64/zetas.c)
target_include_directories(ml_kem_512_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_x86_64)
target_include_directories(ml_kem_512_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
target_compile_options(ml_kem_512_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt )
@@ -24,7 +24,7 @@ if(OQS_ENABLE_KEM_ml_kem_512_x86_64)
endif()
if(OQS_ENABLE_KEM_ml_kem_512_aarch64)
- add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/compress.c mlkem-native_ml-kem-512_aarch64/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/poly_k.c mlkem-native_ml-kem-512_aarch64/sampling.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c)
+ add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/compress.c mlkem-native_ml-kem-512_aarch64/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_reduce_asm_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tomont_asm_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/poly_k.c mlkem-native_ml-kem-512_aarch64/sampling.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c)
target_include_directories(ml_kem_512_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_aarch64)
target_include_directories(ml_kem_512_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
target_compile_options(ml_kem_512_aarch64 PUBLIC -DMLKEM_K=2 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT)
@@ -49,7 +49,7 @@ if(OQS_ENABLE_KEM_ml_kem_768)
endif()
if(OQS_ENABLE_KEM_ml_kem_768_x86_64)
- add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/compress.c mlkem-native_ml-kem-768_x86_64/debug.c mlkem-native_ml-kem-768_x86_64/indcpa.c mlkem-native_ml-kem-768_x86_64/kem.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-768_x86_64/poly.c mlkem-native_ml-kem-768_x86_64/poly_k.c mlkem-native_ml-kem-768_x86_64/sampling.c mlkem-native_ml-kem-768_x86_64/verify.c mlkem-native_ml-kem-768_x86_64/zetas.c)
+ add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/compress.c mlkem-native_ml-kem-768_x86_64/debug.c mlkem-native_ml-kem-768_x86_64/indcpa.c mlkem-native_ml-kem-768_x86_64/kem.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttpack.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/reduce.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/tomont.S mlkem-native_ml-kem-768_x86_64/poly.c mlkem-native_ml-kem-768_x86_64/poly_k.c mlkem-native_ml-kem-768_x86_64/sampling.c mlkem-native_ml-kem-768_x86_64/verify.c mlkem-native_ml-kem-768_x86_64/zetas.c)
target_include_directories(ml_kem_768_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_x86_64)
target_include_directories(ml_kem_768_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
target_compile_options(ml_kem_768_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt )
@@ -58,7 +58,7 @@ if(OQS_ENABLE_KEM_ml_kem_768_x86_64)
endif()
if(OQS_ENABLE_KEM_ml_kem_768_aarch64)
- add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/compress.c mlkem-native_ml-kem-768_aarch64/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/poly_k.c mlkem-native_ml-kem-768_aarch64/sampling.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c)
+ add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/compress.c mlkem-native_ml-kem-768_aarch64/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_reduce_asm_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tomont_asm_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/poly_k.c mlkem-native_ml-kem-768_aarch64/sampling.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c)
target_include_directories(ml_kem_768_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_aarch64)
target_include_directories(ml_kem_768_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
target_compile_options(ml_kem_768_aarch64 PUBLIC -DMLKEM_K=3 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT)
@@ -83,7 +83,7 @@ if(OQS_ENABLE_KEM_ml_kem_1024)
endif()
if(OQS_ENABLE_KEM_ml_kem_1024_x86_64)
- add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/compress.c mlkem-native_ml-kem-1024_x86_64/debug.c mlkem-native_ml-kem-1024_x86_64/indcpa.c mlkem-native_ml-kem-1024_x86_64/kem.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-1024_x86_64/poly.c mlkem-native_ml-kem-1024_x86_64/poly_k.c mlkem-native_ml-kem-1024_x86_64/sampling.c mlkem-native_ml-kem-1024_x86_64/verify.c mlkem-native_ml-kem-1024_x86_64/zetas.c)
+ add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/compress.c mlkem-native_ml-kem-1024_x86_64/debug.c mlkem-native_ml-kem-1024_x86_64/indcpa.c mlkem-native_ml-kem-1024_x86_64/kem.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttpack.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/reduce.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/tomont.S mlkem-native_ml-kem-1024_x86_64/poly.c mlkem-native_ml-kem-1024_x86_64/poly_k.c mlkem-native_ml-kem-1024_x86_64/sampling.c mlkem-native_ml-kem-1024_x86_64/verify.c mlkem-native_ml-kem-1024_x86_64/zetas.c)
target_include_directories(ml_kem_1024_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_x86_64)
target_include_directories(ml_kem_1024_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
target_compile_options(ml_kem_1024_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt )
@@ -92,7 +92,7 @@ if(OQS_ENABLE_KEM_ml_kem_1024_x86_64)
endif()
if(OQS_ENABLE_KEM_ml_kem_1024_aarch64)
- add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/compress.c mlkem-native_ml-kem-1024_aarch64/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/poly_k.c mlkem-native_ml-kem-1024_aarch64/sampling.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c)
+ add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/compress.c mlkem-native_ml-kem-1024_aarch64/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_reduce_asm_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tomont_asm_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/poly_k.c mlkem-native_ml-kem-1024_aarch64/sampling.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c)
target_include_directories(ml_kem_1024_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_aarch64)
target_include_directories(ml_kem_1024_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
target_compile_options(ml_kem_1024_aarch64 PUBLIC -DMLKEM_K=4 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md
index e499a4a22..a420f05b6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md
@@ -10,10 +10,9 @@ works:
- _Fast and Clean: Auditable high-performance assembly via constraint solving_, Amin Abdulrahman, Hanno Becker, Matthias
J. Kannwischer, Fabien Klein, [https://eprint.iacr.org/2022/1303](https://eprint.iacr.org/2022/1303)
-## Profiles
-This backend comes with two profiles: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to
-read and modify; for example, is heavily leverages register aliases and assembly macros. The optimized profile is
-automatically generated from the clean profile via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the
+## Variants
+
+This backend comes in two versions: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to read and modify; for example, it heavily leverages register aliases and assembly macros. This directory contains the optimized version, which is automatically generated from the clean one via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the
target architecture is Cortex-A55, but you can easily re-optimize the code for a different microarchitecture supported
-by SLOTHY, by adjusting the parameters in [optimize.sh](src/optimize.sh).
+by SLOTHY, by adjusting the parameters in [optimize.sh](../../../test/aarch64_clean/src/optimize.sh).
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h
deleted file mode 100644
index f124702a4..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/* ML-KEM arithmetic native profile for clean assembly */
-
-#ifdef MLKEM_NATIVE_ARITH_PROFILE_H
-#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles?
-#else
-#define MLKEM_NATIVE_ARITH_PROFILE_H
-
-/* Identifier for this backend so that source and assembly files
- * in the build can be appropriately guarded. */
-#define MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN
-
-#define MLKEM_NATIVE_ARITH_BACKEND_NAME AARCH64_CLEAN
-
-/* Filename of the C backend implementation.
- * This is not inlined here because this header is included in assembly
- * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h"
-
-#endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h
index a7217163f..4a0243279 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h
@@ -3,8 +3,6 @@
* SPDX-License-Identifier: Apache-2.0
*/
-/* ML-KEM arithmetic native profile for clean assembly */
-
#ifdef MLKEM_NATIVE_ARITH_PROFILE_H
#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles?
#else
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c
index 2c1bb31e1..23e7949d3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c
@@ -10,8 +10,7 @@
#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
#include
#include "arith_native_aarch64.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h
index ed0825892..60779598d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h
@@ -29,62 +29,49 @@ extern const int16_t aarch64_zetas_mulcache_native[];
extern const int16_t aarch64_zetas_mulcache_twisted_native[];
extern const uint8_t rej_uniform_table[];
-#define ntt_asm_clean MLKEM_NAMESPACE(ntt_asm_clean)
-void ntt_asm_clean(int16_t *, const int16_t *, const int16_t *);
-
#define ntt_asm_opt MLKEM_NAMESPACE(ntt_asm_opt)
void ntt_asm_opt(int16_t *, const int16_t *, const int16_t *);
-#define intt_asm_clean MLKEM_NAMESPACE(intt_asm_clean)
-void intt_asm_clean(int16_t *, const int16_t *, const int16_t *);
-
#define intt_asm_opt MLKEM_NAMESPACE(intt_asm_opt)
void intt_asm_opt(int16_t *, const int16_t *, const int16_t *);
-#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean)
-unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen,
- const uint8_t *table);
-
-#define poly_reduce_asm_clean MLKEM_NAMESPACE(poly_reduce_asm_clean)
-void poly_reduce_asm_clean(int16_t *);
-
#define poly_reduce_asm_opt MLKEM_NAMESPACE(poly_reduce_asm_opt)
void poly_reduce_asm_opt(int16_t *);
-#define poly_tomont_asm_clean MLKEM_NAMESPACE(poly_tomont_asm_clean)
-void poly_tomont_asm_clean(int16_t *);
-
#define poly_tomont_asm_opt MLKEM_NAMESPACE(poly_tomont_asm_opt)
void poly_tomont_asm_opt(int16_t *);
-#define poly_mulcache_compute_asm_clean \
- MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean)
-void poly_mulcache_compute_asm_clean(int16_t *, const int16_t *,
- const int16_t *, const int16_t *);
-
-
#define poly_mulcache_compute_asm_opt \
MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt)
void poly_mulcache_compute_asm_opt(int16_t *, const int16_t *, const int16_t *,
const int16_t *);
-#define poly_tobytes_asm_clean MLKEM_NAMESPACE(poly_tobytes_asm_clean)
-void poly_tobytes_asm_clean(uint8_t *r, const int16_t *a);
-
#define poly_tobytes_asm_opt MLKEM_NAMESPACE(poly_tobytes_asm_opt)
void poly_tobytes_asm_opt(uint8_t *r, const int16_t *a);
-#define polyvec_basemul_acc_montgomery_cached_asm_clean \
- MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-void polyvec_basemul_acc_montgomery_cached_asm_clean(int16_t *r,
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache);
+#define polyvec_basemul_acc_montgomery_cached_asm_k2_opt \
+ MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt)
+void polyvec_basemul_acc_montgomery_cached_asm_k2_opt(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache);
+
+#define polyvec_basemul_acc_montgomery_cached_asm_k3_opt \
+ MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt)
+void polyvec_basemul_acc_montgomery_cached_asm_k3_opt(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache);
+
+#define polyvec_basemul_acc_montgomery_cached_asm_k4_opt \
+ MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt)
+void polyvec_basemul_acc_montgomery_cached_asm_k4_opt(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache);
-#define polyvec_basemul_acc_montgomery_cached_asm_opt \
- MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-void polyvec_basemul_acc_montgomery_cached_asm_opt(int16_t *r, const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache);
+#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean)
+unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen,
+ const uint8_t *table);
#endif /* MLKEM_AARCH64_NATIVE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h
deleted file mode 100644
index 4be90fb24..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/* ML-KEM arithmetic native profile for clean assembly */
-
-#ifdef MLKEM_NATIVE_ARITH_PROFILE_IMPL_H
-#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles?
-#else
-#define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H
-
-#include "arith_native_aarch64.h"
-
-/* Set of primitives that this backend replaces */
-#define MLKEM_USE_NATIVE_NTT
-#define MLKEM_USE_NATIVE_INTT
-#define MLKEM_USE_NATIVE_POLY_REDUCE
-#define MLKEM_USE_NATIVE_POLY_TOMONT
-#define MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE
-#define MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
-#define MLKEM_USE_NATIVE_POLY_TOBYTES
-#define MLKEM_USE_NATIVE_REJ_UNIFORM
-
-static INLINE void ntt_native(int16_t data[MLKEM_N])
-{
- ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56);
-}
-
-static INLINE void intt_native(int16_t data[MLKEM_N])
-{
- intt_asm_clean(data, aarch64_invntt_zetas_layer01234,
- aarch64_invntt_zetas_layer56);
-}
-
-static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
-{
- poly_reduce_asm_clean(data);
-}
-
-static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
-{
- poly_tomont_asm_clean(data);
-}
-
-static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
- const int16_t y[MLKEM_N])
-{
- poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native,
- aarch64_zetas_mulcache_twisted_native);
-}
-
-static INLINE void polyvec_basemul_acc_montgomery_cached_native(
- int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
- const int16_t b[MLKEM_K * MLKEM_N],
- const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
-{
- polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache);
-}
-
-static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
-{
- poly_tobytes_asm_clean(r, a);
-}
-
-static INLINE int rej_uniform_native(int16_t *r, unsigned len,
- const uint8_t *buf, unsigned buflen)
-{
- if (len != MLKEM_N || buflen % 24 != 0)
- {
- return -1;
- }
- return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table);
-}
-
-#endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S
deleted file mode 100644
index b0ae1ad46..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S
+++ /dev/null
@@ -1,389 +0,0 @@
-/// Copyright (c) 2024 The mlkem-native project authors
-/// Copyright (c) 2022 Arm Limited
-/// Copyright (c) 2022 Hanno Becker
-/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
-/// SPDX-License-Identifier: MIT
-///
-/// Permission is hereby granted, free of charge, to any person obtaining a copy
-/// of this software and associated documentation files (the "Software"), to deal
-/// in the Software without restriction, including without limitation the rights
-/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-/// copies of the Software, and to permit persons to whom the Software is
-/// furnished to do so, subject to the following conditions:
-///
-/// The above copyright notice and this permission notice shall be included in all
-/// copies or substantial portions of the Software.
-///
-/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-/// SOFTWARE.
-///
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
-
-// Bounds:
-// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
-//
-// See mlken/reduce.c and test/test_bounds.py for more details.
-.macro mulmodq dst, src, const, idx0, idx1
- // Signed barrett multiplication using
- // round-to-nearest-even-integer approximation.
- // Following https://eprint.iacr.org/2021/986.pdf, this
- // is functionally the same as a signed Montgomery multiplication
- // with a suitable constant of absolute value < q.
- sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()]
- mul \dst\().8h, \src\().8h, \const\().h[\idx0\()]
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro mulmod dst, src, const, const_twisted
- sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h
- mul \dst\().8h, \src\().8h, \const\().8h
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro gs_butterfly a, b, root, idx0, idx1
- sub tmp.8h, \a\().8h, \b\().8h
- add \a\().8h, \a\().8h, \b\().8h
- mulmodq \b, tmp, \root, \idx0, \idx1
-.endm
-
-.macro gs_butterfly_v a, b, root, root_twisted
- sub tmp.8h, \a\().8h, \b\().8h
- add \a\().8h, \a\().8h, \b\().8h
- mulmod \b, tmp, \root, \root_twisted
-.endm
-
-.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
- mulmod \dst0, \src0, ninv, ninv_tw
- mulmod \dst1, \src1, ninv, ninv_tw
- mulmod \dst2, \src2, ninv, ninv_tw
- mulmod \dst3, \src3, ninv, ninv_tw
-.endm
-
-.macro barrett_reduce a
- sqdmulh t0.8h, \a\().8h, consts.h[1]
- srshr t0.8h, t0.8h, #11
- mls \a\().8h, t0.8h, consts.h[0]
-.endm
-
-.macro load_roots_012
- ldr q_root0, [r01234_ptr], #32
- ldr q_root1, [r01234_ptr, #-16]
-.endm
-
-.macro load_next_roots_34
- ldr q_root0, [r01234_ptr], #16
-.endm
-
-.macro load_next_roots_56
- ldr q_root0, [r56_ptr], #(6*16)
- ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)]
- ldr q_root1, [r56_ptr, #(-6*16 + 2*16)]
- ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)]
- ldr q_root2, [r56_ptr, #(-6*16 + 4*16)]
- ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)]
-.endm
-
-.macro transpose4 data
- trn1 t0.4s, \data\()0.4s, \data\()1.4s
- trn2 t1.4s, \data\()0.4s, \data\()1.4s
- trn1 t2.4s, \data\()2.4s, \data\()3.4s
- trn2 t3.4s, \data\()2.4s, \data\()3.4s
-
- trn2 \data\()2.2d, t0.2d, t2.2d
- trn2 \data\()3.2d, t1.2d, t3.2d
- trn1 \data\()0.2d, t0.2d, t2.2d
- trn1 \data\()1.2d, t1.2d, t3.2d
-.endm
-
-.macro transpose_single data_out, data_in
- trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
- trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
- trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
- trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
-// For comparability reasons, the output range for the coefficients of this
-// invNTT code is supposed to match the implementation from PQClean on commit
-// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients
-// are NOT canonically reduced. The ordering of the coefficients is canonical,
-// also matching PQClean.
-
-.text
- .global MLKEM_ASM_NAMESPACE(intt_asm_clean)
-
- in .req x0
- r01234_ptr .req x1
- r56_ptr .req x2
-
- inp .req x3
- count .req x4
- wtmp .req w5
-
- data0 .req v8
- data1 .req v9
- data2 .req v10
- data3 .req v11
- data4 .req v12
- data5 .req v13
- data6 .req v14
- data7 .req v15
-
- q_data0 .req q8
- q_data1 .req q9
- q_data2 .req q10
- q_data3 .req q11
- q_data4 .req q12
- q_data5 .req q13
- q_data6 .req q14
- q_data7 .req q15
-
- root0 .req v0
- root1 .req v1
- root2 .req v2
- root0_tw .req v4
- root1_tw .req v5
- root2_tw .req v6
-
- consts .req v7
- q_consts .req q7
-
- q_root0 .req q0
- q_root1 .req q1
- q_root2 .req q2
- q_root0_tw .req q4
- q_root1_tw .req q5
- q_root2_tw .req q6
-
- tmp .req v24
- t0 .req v25
- t1 .req v26
- t2 .req v27
- t3 .req v28
-
- ninv .req v29
- ninv_tw .req v30
-
-.balign 4
-MLKEM_ASM_NAMESPACE(intt_asm_clean):
- push_stack
-
- // Setup constants
- mov wtmp, #3329
- mov consts.h[0], wtmp
- mov wtmp, #20159
- mov consts.h[1], wtmp
- mov wtmp, #512
- dup ninv.8h, wtmp
- mov wtmp, #5040
- dup ninv_tw.8h, wtmp
-
- mov inp, in
- mov count, #8
-
-scale_start:
-
- ldr q_data0, [inp, #(16*0)]
- ldr q_data1, [inp, #(16*1)]
- ldr q_data2, [inp, #(16*2)]
- ldr q_data3, [inp, #(16*3)]
-
- mul_ninv data0, data1, data2, data3, data0, data1, data2, data3
- // Bounds: Absolute value < q
-
- str q_data0, [inp], #64
- str q_data1, [inp, #(-64 + 16*1)]
- str q_data2, [inp, #(-64 + 16*2)]
- str q_data3, [inp, #(-64 + 16*3)]
-
- subs count, count, #1
- cbnz count, scale_start
-
- mov inp, in
- mov count, #8
-
- .p2align 2
-layer3456_start:
-
- ldr q_data0, [inp, #(16*0)]
- ldr q_data1, [inp, #(16*1)]
- ldr q_data2, [inp, #(16*2)]
- ldr q_data3, [inp, #(16*3)]
-
- transpose4 data // manual ld4
-
- load_next_roots_56
-
- // Layer 7
- gs_butterfly_v data0, data1, root1, root1_tw
- gs_butterfly_v data2, data3, root2, root2_tw
- // Bounds:
- // data0, data2: < 2q
- // data1, data3: < q
-
- // Layer 6
- gs_butterfly_v data0, data2, root0, root0_tw
- gs_butterfly_v data1, data3, root0, root0_tw
- // Bounds:
- // data0: < 4q
- // data1: < 2q
- // data2, data3: < q
-
- transpose4 data
-
- load_next_roots_34
-
- // Layer 5
- gs_butterfly data0, data1, root0, 2, 3
- gs_butterfly data2, data3, root0, 4, 5
- // Max bound: 8q
-
- // Not all of those reductions are needed, but the bounds tracking
- // is easier if we uniformly reduce at this point.
- barrett_reduce data0
- barrett_reduce data2
- barrett_reduce data1
- barrett_reduce data3
-
- // Bounds: q/2
-
- // Layer 4
- gs_butterfly data0, data2, root0, 0, 1
- gs_butterfly data1, data3, root0, 0, 1
- // Bounds: < q
-
- str q_data0, [inp], #(64)
- str q_data1, [inp, #(-64 + 16*1)]
- str q_data2, [inp, #(-64 + 16*2)]
- str q_data3, [inp, #(-64 + 16*3)]
-
- subs count, count, #1
- cbnz count, layer3456_start
-
- // ---------------------------------------------------------------------
-
- mov count, #4
- load_roots_012
-
- .p2align 2
-
-layer012_start:
-
- ldr q_data0, [in, #0]
- ldr q_data1, [in, #(1*(512/8))]
- ldr q_data2, [in, #(2*(512/8))]
- ldr q_data3, [in, #(3*(512/8))]
- ldr q_data4, [in, #(4*(512/8))]
- ldr q_data5, [in, #(5*(512/8))]
- ldr q_data6, [in, #(6*(512/8))]
- ldr q_data7, [in, #(7*(512/8))]
-
- gs_butterfly data0, data1, root0, 6, 7
- gs_butterfly data2, data3, root1, 0, 1
- gs_butterfly data4, data5, root1, 2, 3
- gs_butterfly data6, data7, root1, 4, 5
-
- gs_butterfly data0, data2, root0, 2, 3
- gs_butterfly data1, data3, root0, 2, 3
- gs_butterfly data4, data6, root0, 4, 5
- gs_butterfly data5, data7, root0, 4, 5
-
- gs_butterfly data0, data4, root0, 0, 1
- gs_butterfly data1, data5, root0, 0, 1
- gs_butterfly data2, data6, root0, 0, 1
- gs_butterfly data3, data7, root0, 0, 1
-
- // Bounds: < 8q
-
- str q_data4, [in, #(4*(512/8))]
- str q_data5, [in, #(5*(512/8))]
- str q_data6, [in, #(6*(512/8))]
- str q_data7, [in, #(7*(512/8))]
-
- str q_data0, [in], #(16)
- str q_data1, [in, #(-16 + 1*(512/8))]
- str q_data2, [in, #(-16 + 2*(512/8))]
- str q_data3, [in, #(-16 + 3*(512/8))]
-
- subs count, count, #1
- cbnz count, layer012_start
-
- pop_stack
- ret
-
-/****************** REGISTER DEALLOCATIONS *******************/
- .unreq in
- .unreq r01234_ptr
- .unreq r56_ptr
- .unreq inp
- .unreq count
- .unreq wtmp
- .unreq data0
- .unreq data1
- .unreq data2
- .unreq data3
- .unreq data4
- .unreq data5
- .unreq data6
- .unreq data7
- .unreq q_data0
- .unreq q_data1
- .unreq q_data2
- .unreq q_data3
- .unreq q_data4
- .unreq q_data5
- .unreq q_data6
- .unreq q_data7
- .unreq root0
- .unreq root1
- .unreq root2
- .unreq root0_tw
- .unreq root1_tw
- .unreq root2_tw
- .unreq consts
- .unreq q_consts
- .unreq q_root0
- .unreq q_root1
- .unreq q_root2
- .unreq q_root0_tw
- .unreq q_root1_tw
- .unreq q_root2_tw
- .unreq tmp
- .unreq t0
- .unreq t1
- .unreq t2
- .unreq t3
- .unreq ninv
- .unreq ninv_tw
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S
index 191de3c4d..0f9e44307 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S
@@ -25,6 +25,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
// Bounds:
// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
@@ -139,9 +140,6 @@
// are NOT canonically reduced. The ordering of the coefficients is canonical,
// also matching PQClean.
-.text
- .global MLKEM_ASM_NAMESPACE(intt_asm_opt)
-
in .req x0
r01234_ptr .req x1
r56_ptr .req x2
@@ -194,7 +192,9 @@
ninv .req v29
ninv_tw .req v30
-.balign 4
+ .text
+ .global MLKEM_ASM_NAMESPACE(intt_asm_opt)
+ .balign 4
MLKEM_ASM_NAMESPACE(intt_asm_opt):
push_stack
@@ -1042,4 +1042,5 @@ layer012_start:
.unreq ninv
.unreq ninv_tw
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S
deleted file mode 100644
index 4f844e212..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S
+++ /dev/null
@@ -1,317 +0,0 @@
-///
-/// Copyright (c) 2022 Arm Limited
-/// Copyright (c) 2022 Hanno Becker
-/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
-/// Copyright (c) 2024 The mlkem-native project authors
-// SPDX-License-Identifier: MIT
-///
-/// Permission is hereby granted, free of charge, to any person obtaining a copy
-/// of this software and associated documentation files (the "Software"), to deal
-/// in the Software without restriction, including without limitation the rights
-/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-/// copies of the Software, and to permit persons to whom the Software is
-/// furnished to do so, subject to the following conditions:
-///
-/// The above copyright notice and this permission notice shall be included in all
-/// copies or substantial portions of the Software.
-///
-/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-/// SOFTWARE.
-///
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
-
-// Bounds:
-// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
-//
-// See mlken/reduce.c and test/test_bounds.py for more details.
-.macro mulmodq dst, src, const, idx0, idx1
- // Signed barrett multiplication using
- // round-to-nearest-even-integer approximation.
- // Following https://eprint.iacr.org/2021/986.pdf, this
- // is functionally the same as a signed Montgomery multiplication
- // with a suitable constant of absolute value < q.
- sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()]
- mul \dst\().8h, \src\().8h, \const\().h[\idx0\()]
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro mulmod dst, src, const, const_twisted
- sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h
- mul \dst\().8h, \src\().8h, \const\().8h
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro ct_butterfly a, b, root, idx0, idx1
- mulmodq tmp, \b, \root, \idx0, \idx1
- sub \b\().8h, \a\().8h, tmp.8h
- add \a\().8h, \a\().8h, tmp.8h
-.endm
-
-.macro ct_butterfly_v a, b, root, root_twisted
- mulmod tmp, \b, \root, \root_twisted
- sub \b\().8h, \a\().8h, tmp.8h
- add \a\().8h, \a\().8h, tmp.8h
-.endm
-
-.macro load_roots_012
- ldr q_root0, [r01234_ptr], #32
- ldr q_root1, [r01234_ptr, #-16]
-.endm
-
-.macro load_next_roots_34
- ldr q_root0, [r01234_ptr], #16
-.endm
-
-.macro load_next_roots_56
- ldr q_root0, [r56_ptr], #(6*16)
- ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)]
- ldr q_root1, [r56_ptr, #(-6*16 + 2*16)]
- ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)]
- ldr q_root2, [r56_ptr, #(-6*16 + 4*16)]
- ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)]
-.endm
-
-.macro transpose4 data
- trn1 t0.4s, \data\()0.4s, \data\()1.4s
- trn2 t1.4s, \data\()0.4s, \data\()1.4s
- trn1 t2.4s, \data\()2.4s, \data\()3.4s
- trn2 t3.4s, \data\()2.4s, \data\()3.4s
-
- trn2 \data\()2.2d, t0.2d, t2.2d
- trn2 \data\()3.2d, t1.2d, t3.2d
- trn1 \data\()0.2d, t0.2d, t2.2d
- trn1 \data\()1.2d, t1.2d, t3.2d
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
- // Arguments
- in .req x0 // Input/output buffer
- r01234_ptr .req x1 // twiddles for layer 0,1,2,3,4
- r56_ptr .req x2 // twiddles for layer 5,6
-
- inp .req x3
- count .req x4
- wtmp .req w5
-
- data0 .req v8
- data1 .req v9
- data2 .req v10
- data3 .req v11
- data4 .req v12
- data5 .req v13
- data6 .req v14
- data7 .req v15
-
- q_data0 .req q8
- q_data1 .req q9
- q_data2 .req q10
- q_data3 .req q11
- q_data4 .req q12
- q_data5 .req q13
- q_data6 .req q14
- q_data7 .req q15
-
- root0 .req v0
- root1 .req v1
- root2 .req v2
- root0_tw .req v4
- root1_tw .req v5
- root2_tw .req v6
-
- q_root0 .req q0
- q_root1 .req q1
- q_root2 .req q2
- q_root0_tw .req q4
- q_root1_tw .req q5
- q_root2_tw .req q6
-
- consts .req v7
-
- tmp .req v24
- t0 .req v25
- t1 .req v26
- t2 .req v27
- t3 .req v28
-
- .text
- .global MLKEM_ASM_NAMESPACE(ntt_asm_clean)
-
- .balign 4
-MLKEM_ASM_NAMESPACE(ntt_asm_clean):
- push_stack
-
- mov wtmp, #3329
- mov consts.h[0], wtmp
- mov wtmp, #20159
- mov consts.h[1], wtmp
-
- mov inp, in
- mov count, #4
-
- load_roots_012
-
- .p2align 2
-
- // Bounds reasoning:
- // - There are 7 layers
- // - When passing from layer N to layer N+1, each layer-N value
- // is modified through the addition/subtraction of a Montgomery
- // product of a twiddle of absolute value < q/2 and a layer-N value.
- // - Recalling that for C such that |a| < C * q and |t|> 0);
- xtn out0.8b, data0.8h
-
- // r[3 * i + 1] = (t0 >> 8);
- shrn out1.8b, data0.8h, #8
- xtn tmp.8b, data1.8h
- // r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
- sli out1.8b, tmp.8b, #4
-
- // r[3 * i + 2] = (t1 >> 4);
- shrn out2.8b, data1.8h, #4
-
- st3 {out0.8b, out1.8b, out2.8b}, [dst], #24
-
- subs count, count, #1
- cbnz count, poly_tobytes_asm_clean_asm_loop_start
- ret
-
- .unreq data0
- .unreq data1
- .unreq out0
- .unreq out1
- .unreq out2
- .unreq tmp
- .unreq dst
- .unreq src
- .unreq count
-
-/**********************************
- * poly_tomont() *
- **********************************/
-.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean)
-
- src .req x0
- count .req x1
- wtmp .req w2
-
- data .req v0
- q_data .req q0
- res .req v1
- q_res .req q1
-
- factor .req v2
- factor_t .req v3
- modulus .req v4
- modulus_twisted .req v5
-
- tmp0 .req v6
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean):
-
- mov wtmp, #3329 // ML-KEM modulus
- dup modulus.8h, wtmp
-
- mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
- dup modulus_twisted.8h, wtmp
-
- mov wtmp, #-1044 // 2^16 % 3329
- dup factor.8h, wtmp
-
- mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
- dup factor_t.8h, wtmp
-
- mov count, #8
-poly_tomont_asm_loop:
-
- ldr q_data, [src], #64
- mulmod res, data, factor, factor_t
- str q_res, [src, #-64]
-
- ldr q_data, [src, #-48]
- mulmod res, data, factor, factor_t
- str q_res, [src, #-48]
-
- ldr q_data, [src, #-32]
- mulmod res, data, factor, factor_t
- str q_res, [src, #-32]
-
- ldr q_data, [src, #-16]
- mulmod res, data, factor, factor_t
- str q_res, [src, #-16]
-
- sub count, count, #1
- cbnz count, poly_tomont_asm_loop
-
- ret
-
- .unreq src
- .unreq count
- .unreq wtmp
-
- .unreq data
- .unreq q_data
- .unreq res
- .unreq q_res
-
- .unreq factor
- .unreq factor_t
- .unreq modulus
- .unreq modulus_twisted
-
- .unreq tmp0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S
new file mode 100644
index 000000000..a3593b7fd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/* Montgomery multiplication, with precomputed Montgomery twist
+ * Expects modulus in modulus.h[0]; clobbers tmp0. */
+.macro mulmod dst, src, const, const_twisted
+ sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h
+ mul \dst\().8h, \src\().8h, \const\().8h
+ mls \dst\().8h, tmp0.8h, modulus.h[0]
+.endm
+
+/********************************************
+ * poly_mulcache_compute(): per iteration, read 16 halfwords from data_ptr, keep the
+ * 8 odd-indexed ones (uzp2), multiply them by 8 zetas modulo 3329 using the twisted
+ * zetas (mul/sqrdmulh/mls), and store the 8 results to cache_ptr. 16 iterations.
+ ********************************************/
+ cache_ptr .req x0 // output buffer: written by the str q2, [x0], #16 stores
+ data_ptr .req x1 // input buffer: 32 bytes are read per iteration
+ zeta_ptr .req x2 // zeta table: 16 bytes are read per iteration
+ zeta_twisted_ptr .req x3 // twisted zeta table: 16 bytes per iteration, fed to sqrdmulh
+ count .req x4 // loop counter
+ wtmp .req w5 // scratch for materializing constants
+ // The vector aliases below are not used by the scheduled body, which names registers directly.
+ data_odd .req v0
+ zeta .req v1
+ q_zeta .req q1
+ zeta_twisted .req v2
+ q_zeta_twisted .req q2
+
+ tmp0 .req v3
+ q_tmp0 .req q3
+ tmp1 .req v4
+ q_tmp1 .req q4
+ dst .req v5
+ q_dst .req q5
+
+ modulus .req v6
+ modulus_twisted .req v7
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
+ mov wtmp, #3329 // ML-KEM modulus
+ dup modulus.8h, wtmp
+
+ mov wtmp, #20159
+ dup modulus_twisted.8h, wtmp // NOTE(review): v7 is reloaded by ldr q7 in the loop before any read seen here
+
+ mov count, #16 // 16 stores in total: 15 from the loop body plus 1 from the tail after the loop
+ // Instructions: 7
+ // Expected cycles: 12
+ // Expected IPC: 0.58
+
+ // Cycle bound: 12.0
+ // IPC bound: 0.58
+
+ // Wall time: 0.01s
+ // User time: 0.01s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q1, [x1, #16] // *.............................
+ ldr q27, [x1], #32 // ..*...........................
+ ldr q23, [x2], #16 // ....*.........................
+ uzp2 v27.8H, v27.8H, v1.8H // ......*.......................
+ ldr q1, [x3], #16 // .......*......................
+ mul v2.8H, v27.8H, v23.8H // .........*....................
+ sqrdmulh v27.8H, v27.8H, v1.8H // ...........*..................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q29, [x1, #16] // *..............................
+ // ldr q21, [x2], #16 // ....*..........................
+ // ldr q27, [x1], #32 // ..*............................
+ // ldr q7, [x3], #16 // .......*.......................
+ // uzp2 v28.8H, v27.8H, v29.8H // ......*........................
+ // mul v2.8H, v28.8H, v21.8H // .........*.....................
+ // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*...................
+
+ sub count, count, #1 // the code above already started one iteration; the tail after the loop finishes the last
+poly_mulcache_compute_asm_opt_loop:
+ // Instructions: 9
+ // Expected cycles: 13
+ // Expected IPC: 0.69
+
+ // Cycle bound: 13.0
+ // IPC bound: 0.69
+
+ // Wall time: 0.09s
+ // User time: 0.09s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q29, [x1, #16] // e.............................
+ ldr q21, [x2], #16 // ..e...........................
+ mls v2.8H, v27.8H, v6.H[0] // ....*.........................
+ ldr q27, [x1], #32 // .....e........................
+ ldr q7, [x3], #16 // .......e......................
+ uzp2 v28.8H, v27.8H, v29.8H // .........e....................
+ str q2, [x0], #16 // ..........*...................
+ mul v2.8H, v28.8H, v21.8H // ...........e..................
+ sqrdmulh v27.8H, v28.8H, v7.8H // ............e.................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q3, [x1], #32 // .....e.......'....~.......'....
+ // ldr q4, [x1, #-16] // e............~............~....
+ // ldr q1, [x2], #16 // ..e..........'.~..........'.~..
+ // ldr q2, [x3], #16 // .......e.....'......~.....'....
+ // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'....
+ // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'....
+ // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'....
+ // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'....
+ // str q5, [x0], #16 // ..........~..'.........*..'....
+
+ sub count, count, 1
+ cbnz count, poly_mulcache_compute_asm_opt_loop
+ // Instructions: 2
+ // Expected cycles: 5
+ // Expected IPC: 0.40
+
+ // Cycle bound: 5.0
+ // IPC bound: 0.40
+
+ // Wall time: 0.00s
+ // User time: 0.00s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ mls v2.8H, v27.8H, v6.H[0] // *.............................
+ str q2, [x0], #16 // ....*.........................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // mls v2.8H, v27.8H, v6.H[0] // *..............................
+ // str q2, [x0], #16 // ....*..........................
+
+
+ ret
+
+ .unreq cache_ptr
+ .unreq data_ptr
+ .unreq zeta_ptr
+ .unreq zeta_twisted_ptr
+ .unreq count
+ .unreq wtmp
+
+ .unreq data_odd
+ .unreq zeta
+ .unreq q_zeta
+ .unreq zeta_twisted
+ .unreq q_zeta_twisted
+
+ .unreq tmp0
+ .unreq q_tmp0
+ .unreq tmp1
+ .unreq q_tmp1
+ .unreq dst
+ .unreq q_dst
+
+ .unreq modulus
+ .unreq modulus_twisted
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S
deleted file mode 100644
index 79605818f..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S
+++ /dev/null
@@ -1,670 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
-
-/*
- * Some modular arithmetic macros
- */
-
-/* Barrett reduction */
-.macro barrett_reduce a
- sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0]
- srshr tmp.8h, tmp.8h, #11
- mls \a\().8h, tmp.8h, modulus.h[0]
-.endm
-
-/* Montgomery multiplication, with precomputed Montgomery twist
- * Expects modulus in consts.h[0]. */
-.macro mulmod dst, src, const, const_twisted
- sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h
- mul \dst\().8h, \src\().8h, \const\().8h
- mls \dst\().8h, tmp0.8h, modulus.h[0]
-.endm
-
-/* Turns signed-canonical to unsigned canonical representative
- * through conditional addition of the modulus.
- *
- * Expected modulus in `modulus`. */
-.macro scalar_signed_to_unsigned a
- sshr mask.8h, \a\().8h, #15
- and mask.16b, modulus.16b, mask.16b
- add \a\().8h, \a\().8h, mask.8h
-.endm
-
-/**********************************
- * poly_reduce() *
- **********************************/
-
-.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt)
-
- ptr .req x0
- count .req x1
- wtmp .req w2
-
- data .req v0
- q_data .req q0
-
- tmp .req v1
- mask .req v2
- modulus .req v3
- modulus_twisted .req v4
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
-
- mov wtmp, #3329 // ML-KEM modulus
- dup modulus.8h, wtmp
-
- mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
- dup modulus_twisted.8h, wtmp
-
- mov count, #8
- // Instructions: 15
- // Expected cycles: 22
- // Expected IPC: 0.68
-
- // Cycle bound: 22.0
- // IPC bound: 0.68
-
- // Wall time: 0.05s
- // User time: 0.05s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q21, [x0, #32] // *.............................
- ldr q23, [x0, #48] // ..*...........................
- sqdmulh v7.8H, v21.8H, v4.H[0] // ....*.........................
- sqdmulh v30.8H, v23.8H, v4.H[0] // ......*.......................
- srshr v7.8H, v7.8H, #11 // ........*.....................
- srshr v30.8H, v30.8H, #11 // ..........*...................
- mls v21.8H, v7.8H, v3.H[0] // ...........*..................
- mls v23.8H, v30.8H, v3.H[0] // .............*................
- ldr q5, [x0, #16] // ..............*...............
- sshr v7.8H, v21.8H, #15 // ................*.............
- sshr v30.8H, v23.8H, #15 // .................*............
- and v7.16B, v3.16B, v7.16B // ..................*...........
- add v21.8H, v21.8H, v7.8H // ...................*..........
- and v7.16B, v3.16B, v30.16B // ....................*.........
- add v16.8H, v23.8H, v7.8H // .....................*........
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q30, [x0, #32] // *..............................
- // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*..........................
- // ldr q2, [x0, #48] // ..*............................
- // srshr v19.8H, v22.8H, #11 // ........*......................
- // mls v30.8H, v19.8H, v3.H[0] // ...........*...................
- // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................
- // sshr v31.8H, v30.8H, #15 // ................*..............
- // srshr v25.8H, v25.8H, #11 // ..........*....................
- // and v18.16B, v3.16B, v31.16B // ..................*............
- // mls v2.8H, v25.8H, v3.H[0] // .............*.................
- // add v21.8H, v30.8H, v18.8H // ...................*...........
- // ldr q5, [x0, #16] // ..............*................
- // sshr v18.8H, v2.8H, #15 // .................*.............
- // and v27.16B, v3.16B, v18.16B // ....................*..........
- // add v16.8H, v2.8H, v27.8H // .....................*.........
-
- sub count, count, #1
-1:
- // Instructions: 32
- // Expected cycles: 36
- // Expected IPC: 0.89
-
- // Cycle bound: 36.0
- // IPC bound: 0.89
-
- // Wall time: 1.05s
- // User time: 1.05s
-
- // -------- cycle (expected) --------->
- // 0 25
- // |------------------------|----------
- ldr q6, [x0], #64 // *...................................
- ldr q30, [x0, #32] // ..e.................................
- sqdmulh v31.8H, v6.8H, v4.H[0] // ....*...............................
- sqdmulh v29.8H, v5.8H, v4.H[0] // .....*..............................
- sqdmulh v22.8H, v30.8H, v4.H[0] // ......e.............................
- str q16, [x0, #-16] // .......*............................
- srshr v20.8H, v31.8H, #11 // ........*...........................
- srshr v28.8H, v29.8H, #11 // .........*..........................
- str q21, [x0, #-32] // ..........*.........................
- mls v6.8H, v20.8H, v3.H[0] // ...........*........................
- mls v5.8H, v28.8H, v3.H[0] // ............*.......................
- ldr q2, [x0, #48] // .............e......................
- sshr v31.8H, v6.8H, #15 // ...............*....................
- srshr v19.8H, v22.8H, #11 // ................e...................
- and v22.16B, v3.16B, v31.16B // .................*..................
- add v0.8H, v6.8H, v22.8H // ..................*.................
- mls v30.8H, v19.8H, v3.H[0] // ...................e................
- sshr v26.8H, v5.8H, #15 // ....................*...............
- sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e..............
- and v17.16B, v3.16B, v26.16B // ......................*.............
- add v1.8H, v5.8H, v17.8H // .......................*............
- sshr v31.8H, v30.8H, #15 // ........................e...........
- srshr v25.8H, v25.8H, #11 // .........................e..........
- str q1, [x0, #-48] // ..........................*.........
- and v18.16B, v3.16B, v31.16B // ...........................e........
- mls v2.8H, v25.8H, v3.H[0] // ............................e.......
- add v21.8H, v30.8H, v18.8H // .............................e......
- ldr q5, [x0, #16] // ..............................e.....
- sshr v18.8H, v2.8H, #15 // ................................e...
- str q0, [x0, #-64] // .................................*..
- and v27.16B, v3.16B, v18.16B // ..................................e.
- add v16.8H, v2.8H, v27.8H // ...................................e
-
- // ------------------------ cycle (expected) ------------------------->
- // 0 25 50
- // |------------------------|------------------------|-----------------
- // ldr q0, [x0], #64 // ..................................*.................................
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*.............................
- // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*.........................
- // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*......................
- // sshr v2.8h, v0.8h, #15 // .............~....................'..............*..................
- // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................
- // add v0.8h, v0.8h, v2.8h // ................~.................'.................*...............
- // str q0, [x0, #-64] // ...............................~..'................................*
- // ldr q0, [x0, #-48] // ............................e.....'.............................~...
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................
- // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................
- // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*.....................
- // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*.............
- // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*...........
- // add v0.8h, v0.8h, v2.8h // .....................~............'......................*..........
- // str q0, [x0, #-48] // ........................~.........'.........................*.......
- // ldr q0, [x0, #-32] // e.................................'.~...............................
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~...........................
- // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~.................
- // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~..............
- // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~.........
- // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~......
- // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~....
- // str q0, [x0, #-32] // ........~.........................'.........*.......................
- // ldr q0, [x0, #-16] // ...........e......................'............~....................
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............
- // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........
- // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~.....
- // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~.
- // and v2.16b, v3.16b, v2.16b // ................................e.'.................................
- // add v0.8h, v0.8h, v2.8h // .................................e'.................................
- // str q0, [x0, #-16] // .....~............................'......*..........................
-
- sub count, count, 1
- cbnz count, 1b
- // Instructions: 17
- // Expected cycles: 23
- // Expected IPC: 0.74
-
- // Cycle bound: 23.0
- // IPC bound: 0.74
-
- // Wall time: 0.05s
- // User time: 0.05s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- sqdmulh v20.8H, v5.8H, v4.H[0] // *.............................
- ldr q24, [x0], #64 // .*............................
- str q21, [x0, #-32] // ...*..........................
- srshr v20.8H, v20.8H, #11 // ....*.........................
- sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................
- str q16, [x0, #-16] // ......*.......................
- mls v5.8H, v20.8H, v3.H[0] // .......*......................
- srshr v20.8H, v25.8H, #11 // .........*....................
- sshr v2.8H, v5.8H, #15 // ...........*..................
- mls v24.8H, v20.8H, v3.H[0] // ............*.................
- and v20.16B, v3.16B, v2.16B // .............*................
- add v31.8H, v5.8H, v20.8H // ..............*...............
- sshr v20.8H, v24.8H, #15 // ................*.............
- str q31, [x0, #-48] // .................*............
- and v31.16B, v3.16B, v20.16B // ..................*...........
- add v24.8H, v24.8H, v31.8H // ...................*..........
- str q24, [x0, #-64] // ......................*.......
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q6, [x0], #64 // .*.............................
- // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*.........................
- // sqdmulh v29.8H, v5.8H, v4.H[0] // *..............................
- // str q16, [x0, #-16] // ......*........................
- // srshr v20.8H, v31.8H, #11 // .........*.....................
- // srshr v28.8H, v29.8H, #11 // ....*..........................
- // str q21, [x0, #-32] // ...*...........................
- // mls v6.8H, v20.8H, v3.H[0] // ............*..................
- // mls v5.8H, v28.8H, v3.H[0] // .......*.......................
- // sshr v31.8H, v6.8H, #15 // ................*..............
- // and v22.16B, v3.16B, v31.16B // ..................*............
- // add v0.8H, v6.8H, v22.8H // ...................*...........
- // sshr v26.8H, v5.8H, #15 // ...........*...................
- // and v17.16B, v3.16B, v26.16B // .............*.................
- // add v1.8H, v5.8H, v17.8H // ..............*................
- // str q1, [x0, #-48] // .................*.............
- // str q0, [x0, #-64] // ......................*........
-
-
- ret
-
- .unreq ptr
- .unreq count
- .unreq wtmp
-
- .unreq data
- .unreq q_data
-
- .unreq tmp
- .unreq mask
- .unreq modulus
- .unreq modulus_twisted
-
-/********************************************
- * poly_mulcache_compute() *
- ********************************************/
-
-.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt)
-
- cache_ptr .req x0
- data_ptr .req x1
- zeta_ptr .req x2
- zeta_twisted_ptr .req x3
- count .req x4
- wtmp .req w5
-
- data_odd .req v0
- zeta .req v1
- q_zeta .req q1
- zeta_twisted .req v2
- q_zeta_twisted .req q2
-
- tmp0 .req v3
- q_tmp0 .req q3
- tmp1 .req v4
- q_tmp1 .req q4
- dst .req v5
- q_dst .req q5
-
- modulus .req v6
- modulus_twisted .req v7
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #20159
- dup modulus_twisted.8h, wtmp
-
- mov count, #16
- // Instructions: 7
- // Expected cycles: 12
- // Expected IPC: 0.58
-
- // Cycle bound: 12.0
- // IPC bound: 0.58
-
- // Wall time: 0.01s
- // User time: 0.01s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q1, [x1, #16] // *.............................
- ldr q27, [x1], #32 // ..*...........................
- ldr q23, [x2], #16 // ....*.........................
- uzp2 v27.8H, v27.8H, v1.8H // ......*.......................
- ldr q1, [x3], #16 // .......*......................
- mul v2.8H, v27.8H, v23.8H // .........*....................
- sqrdmulh v27.8H, v27.8H, v1.8H // ...........*..................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q29, [x1, #16] // *..............................
- // ldr q21, [x2], #16 // ....*..........................
- // ldr q27, [x1], #32 // ..*............................
- // ldr q7, [x3], #16 // .......*.......................
- // uzp2 v28.8H, v27.8H, v29.8H // ......*........................
- // mul v2.8H, v28.8H, v21.8H // .........*.....................
- // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*...................
-
- sub count, count, #1
-1:
- // Instructions: 9
- // Expected cycles: 13
- // Expected IPC: 0.69
-
- // Cycle bound: 13.0
- // IPC bound: 0.69
-
- // Wall time: 0.09s
- // User time: 0.09s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q29, [x1, #16] // e.............................
- ldr q21, [x2], #16 // ..e...........................
- mls v2.8H, v27.8H, v6.H[0] // ....*.........................
- ldr q27, [x1], #32 // .....e........................
- ldr q7, [x3], #16 // .......e......................
- uzp2 v28.8H, v27.8H, v29.8H // .........e....................
- str q2, [x0], #16 // ..........*...................
- mul v2.8H, v28.8H, v21.8H // ...........e..................
- sqrdmulh v27.8H, v28.8H, v7.8H // ............e.................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q3, [x1], #32 // .....e.......'....~.......'....
- // ldr q4, [x1, #-16] // e............~............~....
- // ldr q1, [x2], #16 // ..e..........'.~..........'.~..
- // ldr q2, [x3], #16 // .......e.....'......~.....'....
- // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'....
- // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'....
- // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'....
- // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'....
- // str q5, [x0], #16 // ..........~..'.........*..'....
-
- sub count, count, 1
- cbnz count, 1b
- // Instructions: 2
- // Expected cycles: 5
- // Expected IPC: 0.40
-
- // Cycle bound: 5.0
- // IPC bound: 0.40
-
- // Wall time: 0.00s
- // User time: 0.00s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- mls v2.8H, v27.8H, v6.H[0] // *.............................
- str q2, [x0], #16 // ....*.........................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // mls v2.8H, v27.8H, v6.H[0] // *..............................
- // str q2, [x0], #16 // ....*..........................
-
-
- ret
-
- .unreq cache_ptr
- .unreq data_ptr
- .unreq zeta_ptr
- .unreq zeta_twisted_ptr
- .unreq count
- .unreq wtmp
-
- .unreq data_odd
- .unreq zeta
- .unreq q_zeta
- .unreq zeta_twisted
- .unreq q_zeta_twisted
-
- .unreq tmp0
- .unreq q_tmp0
- .unreq tmp1
- .unreq q_tmp1
- .unreq dst
- .unreq q_dst
-
- .unreq modulus
- .unreq modulus_twisted
-
-/********************************************
- * poly_tobytes() *
- ********************************************/
-.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt)
-
- data0 .req v0
- data1 .req v1
- out0 .req v2
- out1 .req v3
- out2 .req v4
- tmp .req v5
-
- dst .req x0
- src .req x1
- count .req x2
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt):
-
- mov count, #16
-poly_tobytes_asm_opt_asm_loop_start:
- ld2 {data0.8h, data1.8h}, [src], #32
-
- // r[3 * i + 0] = (t0 >> 0);
- xtn out0.8b, data0.8h
-
- // r[3 * i + 1] = (t0 >> 8);
- shrn out1.8b, data0.8h, #8
- xtn tmp.8b, data1.8h
- // r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
- sli out1.8b, tmp.8b, #4
-
- // r[3 * i + 2] = (t1 >> 4);
- shrn out2.8b, data1.8h, #4
-
- st3 {out0.8b, out1.8b, out2.8b}, [dst], #24
-
- subs count, count, #1
- cbnz count, poly_tobytes_asm_opt_asm_loop_start
- ret
-
- .unreq data0
- .unreq data1
- .unreq out0
- .unreq out1
- .unreq out2
- .unreq tmp
- .unreq dst
- .unreq src
- .unreq count
-
-/**********************************
- * poly_tomont() *
- **********************************/
-.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt)
-
- src .req x0
- count .req x1
- wtmp .req w2
-
- data .req v0
- q_data .req q0
- res .req v1
- q_res .req q1
-
- factor .req v2
- factor_t .req v3
- modulus .req v4
- modulus_twisted .req v5
-
- tmp0 .req v6
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
-
- mov wtmp, #3329 // ML-KEM modulus
- dup modulus.8h, wtmp
-
- mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
- dup modulus_twisted.8h, wtmp
-
- mov wtmp, #-1044 // 2^16 % 3329
- dup factor.8h, wtmp
-
- mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
- dup factor_t.8h, wtmp
-
- mov count, #8
- // Instructions: 5
- // Expected cycles: 7
- // Expected IPC: 0.71
- //
- // Cycle bound: 7.0
- // IPC bound: 0.71
- //
- // Wall time: 0.01s
- // User time: 0.01s
- //
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q26, [x0, #48] // *.............................
- ldr q23, [x0, #16] // ..*...........................
- mul v17.8H, v26.8H, v2.8H // ....*.........................
- sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................
- ldr q27, [x0, #32] // ......*.......................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q7, [x0, #48] // *..............................
- // ldr q23, [x0, #16] // ..*............................
- // mul v17.8H, v7.8H, v2.8H // ....*..........................
- // sqrdmulh v7.8H, v7.8H, v3.8H // .....*.........................
- // ldr q27, [x0, #32] // ......*........................
-
- sub count, count, #1
-1:
- // Instructions: 20
- // Expected cycles: 24
- // Expected IPC: 0.83
- //
- // Cycle bound: 24.0
- // IPC bound: 0.83
- //
- // Wall time: 0.73s
- // User time: 0.73s
- //
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- mls v17.8H, v7.8H, v4.H[0] // *.............................
- sqrdmulh v5.8H, v23.8H, v3.8H // .*............................
- ldr q7, [x0], #64 // ..*...........................
- str q17, [x0, #-16] // ....*.........................
- sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................
- sqrdmulh v19.8H, v7.8H, v3.8H // ......*.......................
- mul v25.8H, v23.8H, v2.8H // .......*......................
- mul v0.8H, v7.8H, v2.8H // ........*.....................
- mul v26.8H, v27.8H, v2.8H // .........*....................
- ldr q7, [x0, #48] // ..........e...................
- mls v25.8H, v5.8H, v4.H[0] // ............*.................
- ldr q23, [x0, #16] // .............e................
- mls v26.8H, v29.8H, v4.H[0] // ...............*..............
- mls v0.8H, v19.8H, v4.H[0] // ................*.............
- str q25, [x0, #-48] // .................*............
- mul v17.8H, v7.8H, v2.8H // ..................e...........
- sqrdmulh v7.8H, v7.8H, v3.8H // ...................e..........
- str q0, [x0, #-64] // ....................*.........
- ldr q27, [x0, #32] // .....................e........
- str q26, [x0, #-32] // .......................*......
-
- // --------- cycle (expected) ---------->
- // 0 25
- // |------------------------|------------
- // ldr q0, [x0], #64 // ..............'.*.....................
- // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*.................
- // mul v1.8h, v0.8h, v2.8h // ..............'.......*...............
- // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*.......
- // str q1, [x0, #-64] // ..........~...'...................*...
- // ldr q0, [x0, #-48] // ...e..........'............~..........
- // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*......................
- // mul v1.8h, v0.8h, v2.8h // ..............'......*................
- // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*...........
- // str q1, [x0, #-48] // .......~......'................*......
- // ldr q0, [x0, #-32] // ...........e..'....................~..
- // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*..................
- // mul v1.8h, v0.8h, v2.8h // ..............'........*..............
- // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........
- // str q1, [x0, #-32] // .............~'......................*
- // ldr q0, [x0, #-16] // e.............'.........~.............
- // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~....
- // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~.....
- // mls v1.8h, v6.8h, v4.h[0] // ..............*.......................
- // str q1, [x0, #-16] // ..............'...*...................
-
- sub count, count, 1
- cbnz count, 1b
- // Instructions: 15
- // Expected cycles: 18
- // Expected IPC: 0.83
- //
- // Cycle bound: 18.0
- // IPC bound: 0.83
- //
- // Wall time: 0.07s
- // User time: 0.07s
- //
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- mls v17.8H, v7.8H, v4.H[0] // *.............................
- sqrdmulh v7.8H, v23.8H, v3.8H // .*............................
- mul v26.8H, v23.8H, v2.8H // ..*...........................
- sqrdmulh v25.8H, v27.8H, v3.8H // ...*..........................
- ldr q23, [x0], #64 // ....*.........................
- mul v27.8H, v27.8H, v2.8H // ......*.......................
- mls v26.8H, v7.8H, v4.H[0] // .......*......................
- sqrdmulh v7.8H, v23.8H, v3.8H // ........*.....................
- mul v23.8H, v23.8H, v2.8H // .........*....................
- str q17, [x0, #-16] // ..........*...................
- mls v27.8H, v25.8H, v4.H[0] // ...........*..................
- str q26, [x0, #-48] // ............*.................
- mls v23.8H, v7.8H, v4.H[0] // .............*................
- str q27, [x0, #-32] // ...............*..............
- str q23, [x0, #-64] // .................*............
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // mls v17.8H, v7.8H, v4.H[0] // *..............................
- // sqrdmulh v5.8H, v23.8H, v3.8H // .*.............................
- // ldr q7, [x0], #64 // ....*..........................
- // str q17, [x0, #-16] // ..........*....................
- // sqrdmulh v29.8H, v27.8H, v3.8H // ...*...........................
- // sqrdmulh v19.8H, v7.8H, v3.8H // ........*......................
- // mul v25.8H, v23.8H, v2.8H // ..*............................
- // mul v0.8H, v7.8H, v2.8H // .........*.....................
- // mul v26.8H, v27.8H, v2.8H // ......*........................
- // mls v25.8H, v5.8H, v4.H[0] // .......*.......................
- // mls v26.8H, v29.8H, v4.H[0] // ...........*...................
- // mls v0.8H, v19.8H, v4.H[0] // .............*.................
- // str q25, [x0, #-48] // ............*..................
- // str q0, [x0, #-64] // .................*.............
- // str q26, [x0, #-32] // ...............*...............
-
-
- ret
-
- .unreq src
- .unreq count
- .unreq wtmp
-
- .unreq data
- .unreq q_data
- .unreq res
- .unreq q_res
-
- .unreq factor
- .unreq factor_t
- .unreq modulus
- .unreq modulus_twisted
-
- .unreq tmp0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_reduce_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_reduce_asm_opt.S
new file mode 100644
index 000000000..410950730
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_reduce_asm_opt.S
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/* Barrett reduction of the eight 16-bit lanes of `a`, in place; expects constants in modulus.h[0] and modulus_twisted.h[0]; clobbers tmp. */
+.macro barrett_reduce a
+ sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] // tmp = high half of 2 * a * modulus_twisted.h[0] (saturating)
+ srshr tmp.8h, tmp.8h, #11 // tmp = tmp / 2^11 with rounding: the quotient estimate
+ mls \a\().8h, tmp.8h, modulus.h[0] // a -= tmp * modulus.h[0]
+.endm
+
+/* Converts signed canonical representatives to unsigned canonical ones:
+ * the modulus is added to exactly those 16-bit lanes of `a` that are negative.
+ *
+ * Expects the modulus in every lane of `modulus`; clobbers `mask`. */
+.macro scalar_signed_to_unsigned a
+ sshr mask.8h, \a\().8h, #15 // mask lane = all ones if the lane of a is negative, else zero
+ and mask.16b, modulus.16b, mask.16b // mask lane = modulus if the lane of a is negative, else zero
+ add \a\().8h, \a\().8h, mask.8h // conditional addition of the modulus
+.endm
+
+/**********************************
+ * poly_reduce(): reduce the 256 16-bit coefficients at x0 in place modulo 3329
+ **********************************/
+
+ ptr .req x0 // base address of the coefficients; the scheduled code refers to x0 directly
+ count .req x1 // loop counter
+ wtmp .req w2 // scratch register for materializing the constants
+
+ data .req v0 // not referenced by name in the scheduled code
+ q_data .req q0 // 128-bit view of data; not referenced by name in the scheduled code
+
+ tmp .req v1 // scratch of the barrett_reduce macro
+ mask .req v2 // scratch of the scalar_signed_to_unsigned macro
+ modulus .req v3 // 3329 in every 16-bit lane (set at function entry)
+ modulus_twisted .req v4 // 20159 in every 16-bit lane (set at function entry)
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt) // exported entry point; x0 = coefficient array, updated in place
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
+
+ mov wtmp, #3329 // ML-KEM modulus
+ dup modulus.8h, wtmp // broadcast to all eight 16-bit lanes of v3
+
+ mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+ dup modulus_twisted.8h, wtmp // broadcast to all eight 16-bit lanes of v4
+
+ mov count, #8 // 8 blocks of 64 bytes (four 128-bit vectors each) = 256 coefficients
+ // Instructions: 15
+ // Expected cycles: 22
+ // Expected IPC: 0.68
+
+ // Cycle bound: 22.0
+ // IPC bound: 0.68
+
+ // Wall time: 0.05s
+ // User time: 0.05s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q21, [x0, #32] // *.............................
+ ldr q23, [x0, #48] // ..*...........................
+ sqdmulh v7.8H, v21.8H, v4.H[0] // ....*.........................
+ sqdmulh v30.8H, v23.8H, v4.H[0] // ......*.......................
+ srshr v7.8H, v7.8H, #11 // ........*.....................
+ srshr v30.8H, v30.8H, #11 // ..........*...................
+ mls v21.8H, v7.8H, v3.H[0] // ...........*..................
+ mls v23.8H, v30.8H, v3.H[0] // .............*................
+ ldr q5, [x0, #16] // ..............*...............
+ sshr v7.8H, v21.8H, #15 // ................*.............
+ sshr v30.8H, v23.8H, #15 // .................*............
+ and v7.16B, v3.16B, v7.16B // ..................*...........
+ add v21.8H, v21.8H, v7.8H // ...................*..........
+ and v7.16B, v3.16B, v30.16B // ....................*.........
+ add v16.8H, v23.8H, v7.8H // .....................*........
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q30, [x0, #32] // *..............................
+ // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*..........................
+ // ldr q2, [x0, #48] // ..*............................
+ // srshr v19.8H, v22.8H, #11 // ........*......................
+ // mls v30.8H, v19.8H, v3.H[0] // ...........*...................
+ // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................
+ // sshr v31.8H, v30.8H, #15 // ................*..............
+ // srshr v25.8H, v25.8H, #11 // ..........*....................
+ // and v18.16B, v3.16B, v31.16B // ..................*............
+ // mls v2.8H, v25.8H, v3.H[0] // .............*.................
+ // add v21.8H, v30.8H, v18.8H // ...................*...........
+ // ldr q5, [x0, #16] // ..............*................
+ // sshr v18.8H, v2.8H, #15 // .................*.............
+ // and v27.16B, v3.16B, v18.16B // ....................*..........
+ // add v16.8H, v2.8H, v27.8H // .....................*.........
+
+ sub count, count, #1 // the code before the loop and the code after it together account for one of the 8 blocks
+poly_reduce_asm_opt_loop:
+ // Instructions: 32
+ // Expected cycles: 36
+ // Expected IPC: 0.89
+
+ // Cycle bound: 36.0
+ // IPC bound: 0.89
+
+ // Wall time: 1.05s
+ // User time: 1.05s
+
+ // -------- cycle (expected) --------->
+ // 0 25
+ // |------------------------|----------
+ ldr q6, [x0], #64 // *...................................
+ ldr q30, [x0, #32] // ..e.................................
+ sqdmulh v31.8H, v6.8H, v4.H[0] // ....*...............................
+ sqdmulh v29.8H, v5.8H, v4.H[0] // .....*..............................
+ sqdmulh v22.8H, v30.8H, v4.H[0] // ......e.............................
+ str q16, [x0, #-16] // .......*............................
+ srshr v20.8H, v31.8H, #11 // ........*...........................
+ srshr v28.8H, v29.8H, #11 // .........*..........................
+ str q21, [x0, #-32] // ..........*.........................
+ mls v6.8H, v20.8H, v3.H[0] // ...........*........................
+ mls v5.8H, v28.8H, v3.H[0] // ............*.......................
+ ldr q2, [x0, #48] // .............e......................
+ sshr v31.8H, v6.8H, #15 // ...............*....................
+ srshr v19.8H, v22.8H, #11 // ................e...................
+ and v22.16B, v3.16B, v31.16B // .................*..................
+ add v0.8H, v6.8H, v22.8H // ..................*.................
+ mls v30.8H, v19.8H, v3.H[0] // ...................e................
+ sshr v26.8H, v5.8H, #15 // ....................*...............
+ sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e..............
+ and v17.16B, v3.16B, v26.16B // ......................*.............
+ add v1.8H, v5.8H, v17.8H // .......................*............
+ sshr v31.8H, v30.8H, #15 // ........................e...........
+ srshr v25.8H, v25.8H, #11 // .........................e..........
+ str q1, [x0, #-48] // ..........................*.........
+ and v18.16B, v3.16B, v31.16B // ...........................e........
+ mls v2.8H, v25.8H, v3.H[0] // ............................e.......
+ add v21.8H, v30.8H, v18.8H // .............................e......
+ ldr q5, [x0, #16] // ..............................e.....
+ sshr v18.8H, v2.8H, #15 // ................................e...
+ str q0, [x0, #-64] // .................................*..
+ and v27.16B, v3.16B, v18.16B // ..................................e.
+ add v16.8H, v2.8H, v27.8H // ...................................e
+
+ // ------------------------ cycle (expected) ------------------------->
+ // 0 25 50
+ // |------------------------|------------------------|-----------------
+ // ldr q0, [x0], #64 // ..................................*.................................
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*.............................
+ // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*.........................
+ // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*......................
+ // sshr v2.8h, v0.8h, #15 // .............~....................'..............*..................
+ // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................
+ // add v0.8h, v0.8h, v2.8h // ................~.................'.................*...............
+ // str q0, [x0, #-64] // ...............................~..'................................*
+ // ldr q0, [x0, #-48] // ............................e.....'.............................~...
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................
+ // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................
+ // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*.....................
+ // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*.............
+ // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*...........
+ // add v0.8h, v0.8h, v2.8h // .....................~............'......................*..........
+ // str q0, [x0, #-48] // ........................~.........'.........................*.......
+ // ldr q0, [x0, #-32] // e.................................'.~...............................
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~...........................
+ // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~.................
+ // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~..............
+ // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~.........
+ // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~......
+ // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~....
+ // str q0, [x0, #-32] // ........~.........................'.........*.......................
+ // ldr q0, [x0, #-16] // ...........e......................'............~....................
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............
+ // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........
+ // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~.....
+ // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~.
+ // and v2.16b, v3.16b, v2.16b // ................................e.'.................................
+ // add v0.8h, v0.8h, v2.8h // .................................e'.................................
+ // str q0, [x0, #-16] // .....~............................'......*..........................
+
+ sub count, count, 1
+ cbnz count, poly_reduce_asm_opt_loop // count is 7 on loop entry, so the loop body runs 7 times
+ // Instructions: 17
+ // Expected cycles: 23
+ // Expected IPC: 0.74
+
+ // Cycle bound: 23.0
+ // IPC bound: 0.74
+
+ // Wall time: 0.05s
+ // User time: 0.05s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ sqdmulh v20.8H, v5.8H, v4.H[0] // *.............................
+ ldr q24, [x0], #64 // .*............................
+ str q21, [x0, #-32] // ...*..........................
+ srshr v20.8H, v20.8H, #11 // ....*.........................
+ sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................
+ str q16, [x0, #-16] // ......*.......................
+ mls v5.8H, v20.8H, v3.H[0] // .......*......................
+ srshr v20.8H, v25.8H, #11 // .........*....................
+ sshr v2.8H, v5.8H, #15 // ...........*..................
+ mls v24.8H, v20.8H, v3.H[0] // ............*.................
+ and v20.16B, v3.16B, v2.16B // .............*................
+ add v31.8H, v5.8H, v20.8H // ..............*...............
+ sshr v20.8H, v24.8H, #15 // ................*.............
+ str q31, [x0, #-48] // .................*............
+ and v31.16B, v3.16B, v20.16B // ..................*...........
+ add v24.8H, v24.8H, v31.8H // ...................*..........
+ str q24, [x0, #-64] // ......................*.......
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q6, [x0], #64 // .*.............................
+ // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*.........................
+ // sqdmulh v29.8H, v5.8H, v4.H[0] // *..............................
+ // str q16, [x0, #-16] // ......*........................
+ // srshr v20.8H, v31.8H, #11 // .........*.....................
+ // srshr v28.8H, v29.8H, #11 // ....*..........................
+ // str q21, [x0, #-32] // ...*...........................
+ // mls v6.8H, v20.8H, v3.H[0] // ............*..................
+ // mls v5.8H, v28.8H, v3.H[0] // .......*.......................
+ // sshr v31.8H, v6.8H, #15 // ................*..............
+ // and v22.16B, v3.16B, v31.16B // ..................*............
+ // add v0.8H, v6.8H, v22.8H // ...................*...........
+ // sshr v26.8H, v5.8H, #15 // ...........*...................
+ // and v17.16B, v3.16B, v26.16B // .............*.................
+ // add v1.8H, v5.8H, v17.8H // ..............*................
+ // str q1, [x0, #-48] // .................*.............
+ // str q0, [x0, #-64] // ......................*........
+
+
+ ret
+
+ .unreq ptr // release the register aliases defined before the function
+ .unreq count
+ .unreq wtmp
+
+ .unreq data
+ .unreq q_data
+
+ .unreq tmp
+ .unreq mask
+ .unreq modulus
+ .unreq modulus_twisted
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S
new file mode 100644
index 000000000..bc33afd43
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/********************************************
+ * poly_tobytes(): pack the low 12 bits of 256 16-bit coefficients (src) into 384 bytes (dst)
+ ********************************************/
+
+ data0 .req v0 // coefficients at even positions of the current group (t0)
+ data1 .req v1 // coefficients at odd positions of the current group (t1)
+ out0 .req v2 // first output byte of each 3-byte group
+ out1 .req v3 // second output byte of each 3-byte group
+ out2 .req v4 // third output byte of each 3-byte group
+ tmp .req v5 // scratch: low bytes of t1
+
+ dst .req x0 // output byte pointer, advanced by 24 per iteration
+ src .req x1 // input coefficient pointer, advanced by 32 per iteration
+ count .req x2 // loop counter
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt):
+
+ mov count, #16 // 16 iterations of 16 coefficients each
+poly_tobytes_asm_opt_asm_loop_start:
+ ld2 {data0.8h, data1.8h}, [src], #32 // de-interleaving load of 8 (t0, t1) pairs
+
+ // r[3 * i + 0] = (t0 >> 0);
+ xtn out0.8b, data0.8h // keep the low byte of each t0
+
+ // r[3 * i + 1] = (t0 >> 8);
+ shrn out1.8b, data0.8h, #8 // high byte of each t0
+ xtn tmp.8b, data1.8h // low byte of each t1
+ // r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
+ sli out1.8b, tmp.8b, #4 // insert tmp << 4, keeping only the low 4 bits of out1
+
+ // r[3 * i + 2] = (t1 >> 4);
+ shrn out2.8b, data1.8h, #4 // bits 4..11 of each t1
+
+ st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 // interleaving store of 8 three-byte groups
+
+ subs count, count, #1 // also updates NZCV, but the cbnz below tests count directly
+ cbnz count, poly_tobytes_asm_opt_asm_loop_start
+ ret
+
+ .unreq data0 // release the register aliases defined before the function
+ .unreq data1
+ .unreq out0
+ .unreq out1
+ .unreq out2
+ .unreq tmp
+ .unreq dst
+ .unreq src
+ .unreq count
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tomont_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tomont_asm_opt.S
new file mode 100644
index 000000000..bcbff9adb
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tomont_asm_opt.S
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/* Modular multiplication of src by const, using the precomputed twist of const.
+ * Expects modulus in modulus.h[0]. Clobbers tmp0. */
+.macro mulmod dst, src, const, const_twisted
+ sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h // tmp0 = rounded doubling high half of src * const_twisted
+ mul \dst\().8h, \src\().8h, \const\().8h // dst = low 16 bits of src * const
+ mls \dst\().8h, tmp0.8h, modulus.h[0] // dst -= tmp0 * modulus
+.endm
+
+/**********************************
+ * poly_tomont() *
+ **********************************/
+// Rewrites the buffer at x0 in place: each 16-bit lane becomes lane * factor minus a multiple of the modulus.
+ src .req x0 // buffer pointer; the scheduled code below refers to it as x0
+ count .req x1 // remaining loop iterations
+ wtmp .req w2 // scratch for materialising constants
+
+ data .req v0 // not referenced by the scheduled code, which uses raw register names
+ q_data .req q0
+ res .req v1 // not referenced by the scheduled code, which uses raw register names
+ q_res .req q1
+
+ factor .req v2 // multiplier, broadcast to all lanes
+ factor_t .req v3 // precomputed twist of factor, fed to sqrdmulh
+ modulus .req v4 // 3329 in every lane; lane 0 is used by mls
+ modulus_twisted .req v5
+
+ tmp0 .req v6 // scratch named by the mulmod macro
+
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
+ // Structure: prologue (starts block 0), 7 loop passes, epilogue (finishes the last block); 8 blocks of 64 bytes.
+ mov wtmp, #3329 // ML-KEM modulus
+ dup modulus.8h, wtmp
+
+ mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+ dup modulus_twisted.8h, wtmp
+ // NOTE(review): v5 is overwritten as scratch in the loop before it is read, so the value above looks unused.
+ mov wtmp, #-1044 // 2^16 % 3329
+ dup factor.8h, wtmp
+
+ mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+ dup factor_t.8h, wtmp
+
+ mov count, #8 // 8 blocks of four 16-byte vectors
+ // Instructions: 5
+ // Expected cycles: 7
+ // Expected IPC: 0.71
+ //
+ // Cycle bound: 7.0
+ // IPC bound: 0.71
+ //
+ // Wall time: 0.01s
+ // User time: 0.01s
+ //
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q26, [x0, #48] // *.............................
+ ldr q23, [x0, #16] // ..*...........................
+ mul v17.8H, v26.8H, v2.8H // ....*.........................
+ sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................
+ ldr q27, [x0, #32] // ......*.......................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q7, [x0, #48] // *..............................
+ // ldr q23, [x0, #16] // ..*............................
+ // mul v17.8H, v7.8H, v2.8H // ....*..........................
+ // sqrdmulh v7.8H, v7.8H, v3.8H // .....*.........................
+ // ldr q27, [x0, #32] // ......*........................
+
+ sub count, count, #1 // the last block is finished after the loop, so loop 7 times
+poly_tomont_asm_opt_loop:
+ // Instructions: 20
+ // Expected cycles: 24
+ // Expected IPC: 0.83
+ //
+ // Cycle bound: 24.0
+ // IPC bound: 0.83
+ //
+ // Wall time: 0.73s
+ // User time: 0.73s
+ //
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ mls v17.8H, v7.8H, v4.H[0] // *.............................
+ sqrdmulh v5.8H, v23.8H, v3.8H // .*............................
+ ldr q7, [x0], #64 // ..*...........................
+ str q17, [x0, #-16] // ....*.........................
+ sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................
+ sqrdmulh v19.8H, v7.8H, v3.8H // ......*.......................
+ mul v25.8H, v23.8H, v2.8H // .......*......................
+ mul v0.8H, v7.8H, v2.8H // ........*.....................
+ mul v26.8H, v27.8H, v2.8H // .........*....................
+ ldr q7, [x0, #48] // ..........e...................
+ mls v25.8H, v5.8H, v4.H[0] // ............*.................
+ ldr q23, [x0, #16] // .............e................
+ mls v26.8H, v29.8H, v4.H[0] // ...............*..............
+ mls v0.8H, v19.8H, v4.H[0] // ................*.............
+ str q25, [x0, #-48] // .................*............
+ mul v17.8H, v7.8H, v2.8H // ..................e...........
+ sqrdmulh v7.8H, v7.8H, v3.8H // ...................e..........
+ str q0, [x0, #-64] // ....................*.........
+ ldr q27, [x0, #32] // .....................e........
+ str q26, [x0, #-32] // .......................*......
+
+ // --------- cycle (expected) ---------->
+ // 0 25
+ // |------------------------|------------
+ // ldr q0, [x0], #64 // ..............'.*.....................
+ // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*.................
+ // mul v1.8h, v0.8h, v2.8h // ..............'.......*...............
+ // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*.......
+ // str q1, [x0, #-64] // ..........~...'...................*...
+ // ldr q0, [x0, #-48] // ...e..........'............~..........
+ // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*......................
+ // mul v1.8h, v0.8h, v2.8h // ..............'......*................
+ // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*...........
+ // str q1, [x0, #-48] // .......~......'................*......
+ // ldr q0, [x0, #-32] // ...........e..'....................~..
+ // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*..................
+ // mul v1.8h, v0.8h, v2.8h // ..............'........*..............
+ // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........
+ // str q1, [x0, #-32] // .............~'......................*
+ // ldr q0, [x0, #-16] // e.............'.........~.............
+ // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~....
+ // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~.....
+ // mls v1.8h, v6.8h, v4.h[0] // ..............*.......................
+ // str q1, [x0, #-16] // ..............'...*...................
+
+ sub count, count, 1
+ cbnz count, poly_tomont_asm_opt_loop
+ // Instructions: 15
+ // Expected cycles: 18
+ // Expected IPC: 0.83
+ //
+ // Cycle bound: 18.0
+ // IPC bound: 0.83
+ //
+ // Wall time: 0.07s
+ // User time: 0.07s
+ //
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ mls v17.8H, v7.8H, v4.H[0] // *.............................
+ sqrdmulh v7.8H, v23.8H, v3.8H // .*............................
+ mul v26.8H, v23.8H, v2.8H // ..*...........................
+ sqrdmulh v25.8H, v27.8H, v3.8H // ...*..........................
+ ldr q23, [x0], #64 // ....*.........................
+ mul v27.8H, v27.8H, v2.8H // ......*.......................
+ mls v26.8H, v7.8H, v4.H[0] // .......*......................
+ sqrdmulh v7.8H, v23.8H, v3.8H // ........*.....................
+ mul v23.8H, v23.8H, v2.8H // .........*....................
+ str q17, [x0, #-16] // ..........*...................
+ mls v27.8H, v25.8H, v4.H[0] // ...........*..................
+ str q26, [x0, #-48] // ............*.................
+ mls v23.8H, v7.8H, v4.H[0] // .............*................
+ str q27, [x0, #-32] // ...............*..............
+ str q23, [x0, #-64] // .................*............
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // mls v17.8H, v7.8H, v4.H[0] // *..............................
+ // sqrdmulh v5.8H, v23.8H, v3.8H // .*.............................
+ // ldr q7, [x0], #64 // ....*..........................
+ // str q17, [x0, #-16] // ..........*....................
+ // sqrdmulh v29.8H, v27.8H, v3.8H // ...*...........................
+ // sqrdmulh v19.8H, v7.8H, v3.8H // ........*......................
+ // mul v25.8H, v23.8H, v2.8H // ..*............................
+ // mul v0.8H, v7.8H, v2.8H // .........*.....................
+ // mul v26.8H, v27.8H, v2.8H // ......*........................
+ // mls v25.8H, v5.8H, v4.H[0] // .......*.......................
+ // mls v26.8H, v29.8H, v4.H[0] // ...........*...................
+ // mls v0.8H, v19.8H, v4.H[0] // .............*.................
+ // str q25, [x0, #-48] // ............*..................
+ // str q0, [x0, #-64] // .................*.............
+ // str q26, [x0, #-32] // ...............*...............
+
+
+ ret
+
+ .unreq src
+ .unreq count
+ .unreq wtmp
+
+ .unreq data
+ .unreq q_data
+ .unreq res
+ .unreq q_res
+
+ .unreq factor
+ .unreq factor_t
+ .unreq modulus
+ .unreq modulus_twisted
+
+ .unreq tmp0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S
new file mode 100644
index 000000000..e336b92cb
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S
@@ -0,0 +1,546 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// AArch64 re-implementation of the asymmetric base multiplication from:
+
+// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+// https://eprint.iacr.org/2021/986
+// https://github.com/neon-ntt/neon-ntt
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 2
+/* simpasm: header-end */
+
+// Input:
+// - Vectors al, ah of 32-bit entries
+// Output:
+// - Montgomery reductions of al || ah, stored in x (al, ah and t0 are clobbered)
+.macro montgomery_reduce_long x, a
+ uzp1 t0.8h, \a\()l.8h, \a\()h.8h // t0 = low 16 bits of each 32-bit entry
+ mul t0.8h, t0.8h, modulus_twisted.8h // t0 = t0 * modulus_twisted, truncated to 16 bits
+ smlal \a\()l.4s, t0.4h, modulus.4h // al += t0 * modulus for lanes 0..3
+ smlal2 \a\()h.4s, t0.8h, modulus.8h // ah += t0 * modulus for lanes 4..7
+ uzp2 \x\().8h, \a\()l.8h, \a\()h.8h // x = high 16 bits of each 32-bit entry
+.endm
+
+// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+
+// Bounds:
+// - Assume |a| < 4096,
+// - Result: < 2*4096*2^15 = 2^28
+.macro pmull d, a, b
+ smull \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0 = a0*b0, lanes 0..3
+ smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0 = a0*b0, lanes 4..7
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0 += a1*b1t, lanes 0..3
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0 += a1*b1t, lanes 4..7
+
+ smull \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1 = a0*b1, lanes 0..3
+ smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1 = a0*b1, lanes 4..7
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1 += a1*b0, lanes 0..3
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1 += a1*b0, lanes 4..7
+.endm
+
+.macro pmlal d, a, b
+ smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0 += a0*b0, lanes 0..3 (accumulating form of pmull)
+ smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0 += a0*b0, lanes 4..7
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0 += a1*b1t, lanes 0..3
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0 += a1*b1t, lanes 4..7
+
+ smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1 += a0*b1, lanes 0..3
+ smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1 += a0*b1, lanes 4..7
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1 += a1*b0, lanes 0..3
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1 += a1*b0, lanes 4..7
+.endm
+
+.macro ld2_wrap a, ptr
+ ldr q_tmp0, [\ptr\()], #32 // first 16 bytes; ptr advances by 32
+ ldr q_tmp1, [\ptr\(), #-16] // second 16 bytes of the same 32-byte chunk
+ uzp1 \a\()0.8h, tmp0.8h, tmp1.8h // a0 = even-indexed 16-bit elements
+ uzp2 \a\()1.8h, tmp0.8h, tmp1.8h // a1 = odd-indexed 16-bit elements
+.endm
+
+.macro st2_wrap a, ptr
+ zip1 tmp0.8h, \a\()0.8h, \a\()1.8h // interleave the low halves of a0 and a1
+ zip2 tmp1.8h, \a\()0.8h, \a\()1.8h // interleave the high halves of a0 and a1
+ str q_tmp0, [\ptr\()], #32 // first 16 bytes; ptr advances by 32
+ str q_tmp1, [\ptr\(), #-16] // second 16 bytes of the same 32-byte chunk
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
+ ld2_wrap \a\(), \a_ptr // a0/a1 = de-interleaved 32 bytes from a_ptr (clobbers tmp0, tmp1)
+ ld2_wrap \b\(), \b_ptr // b0/b1 = de-interleaved 32 bytes from b_ptr (clobbers tmp0, tmp1)
+ ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 // b1t = next 16 bytes of the cache; pointer advances by 16
+.endm
+
+.macro save_vregs
+ sub sp, sp, #(16*4) // reserve 64 bytes of stack for d8..d15
+ stp d8, d9, [sp, #16*0] // d8..d15 are the low halves of v8..v15, which AAPCS64 makes callee-saved
+ stp d10, d11, [sp, #16*1]
+ stp d12, d13, [sp, #16*2]
+ stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+ ldp d8, d9, [sp, #16*0] // reload d8..d15 from the slots written by save_vregs
+ ldp d10, d11, [sp, #16*1]
+ ldp d12, d13, [sp, #16*2]
+ ldp d14, d15, [sp, #16*3]
+ add sp, sp, #(16*4) // release the 64-byte save area
+.endm
+
+.macro push_stack
+ save_vregs // function prologue: the only stack state is the saved d8..d15
+.endm
+
+.macro pop_stack
+ restore_vregs // undo push_stack: restore d8..d15 and release the save area
+.endm
+
+ out .req x0
+ a0_ptr .req x1
+ b0_ptr .req x2
+ b0_cache_ptr .req x3
+ a1_ptr .req x4
+ b1_ptr .req x5
+ b1_cache_ptr .req x6
+ a2_ptr .req x7
+ b2_ptr .req x8
+ b2_cache_ptr .req x9
+ a3_ptr .req x10
+ b3_ptr .req x11
+ b3_cache_ptr .req x12
+ count .req x13
+ wtmp .req w14
+
+ modulus .req v0
+ modulus_twisted .req v2
+
+ aa0 .req v3
+ aa1 .req v4
+ bb0 .req v5
+ bb1 .req v6
+ bb1t .req v7
+
+ res0l .req v8
+ res1l .req v9
+ res0h .req v10
+ res1h .req v11
+
+ tmp0 .req v12
+ tmp1 .req v13
+ q_tmp0 .req q12
+ q_tmp1 .req q13
+
+ out0 .req v26
+ out1 .req v27
+
+ t0 .req v28
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt):
+ push_stack
+
+ mov wtmp, #3329
+ dup modulus.8h, wtmp
+
+ mov wtmp, #3327
+ dup modulus_twisted.8h, wtmp
+
+ // Computed bases of vector entries
+
+ add a1_ptr, a0_ptr, #(1 * 512)
+ add b1_ptr, b0_ptr, #(1 * 512)
+ add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
+
+ mov count, #(MLKEM_N / 16)
+ // Instructions: 75
+ // Expected cycles: 94
+ // Expected IPC: 0.80
+
+ // Cycle bound: 94.0
+ // IPC bound: 0.80
+
+ // Wall time: 1.49s
+ // User time: 1.49s
+
+ // --------------------------- original position ---------------------------->
+ // 0 25 50
+ // |------------------------|------------------------|
+ ldr q9, [x4], #32 // *..........................................................................
+ ldr q5, [x4, #-16] // ......*....................................................................
+ ldr q11, [x5], #32 // .*.........................................................................
+ uzp1 v23.8H, v9.8H, v5.8H // .........*.................................................................
+ uzp2 v9.8H, v9.8H, v5.8H // .....................*.....................................................
+ ldr q5, [x2], #32 // ..*........................................................................
+ ldr q7, [x5, #-16] // ..............*............................................................
+ ldr q21, [x2, #-16] // ...*.......................................................................
+ uzp2 v10.8H, v11.8H, v7.8H // .................*.........................................................
+ uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................
+ uzp1 v7.8H, v5.8H, v21.8H // ....*......................................................................
+ uzp2 v5.8H, v5.8H, v21.8H // .....*.....................................................................
+ ldr q21, [x1], #32 // .......*...................................................................
+ ldr q25, [x1, #-16] // ........*..................................................................
+ ld1 {v6.8H}, [x3], #16 // ............................*..............................................
+ uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................
+ uzp2 v21.8H, v21.8H, v25.8H // ...........*...............................................................
+ smull v25.4S, v26.4H, v5.4H // ............*..............................................................
+ smull2 v5.4S, v26.8H, v5.8H // .............*.............................................................
+ smull v19.4S, v26.4H, v7.4H // ..........................*................................................
+ smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................
+ smlal v25.4S, v21.4H, v7.4H // ...............*...........................................................
+ smlal2 v5.4S, v21.8H, v7.8H // ................*..........................................................
+ smlal v19.4S, v21.4H, v6.4H // ...................................*.......................................
+ smlal2 v26.4S, v21.8H, v6.8H // .................................*.........................................
+ smlal v25.4S, v23.4H, v10.4H // ...................*.......................................................
+ smlal2 v5.4S, v23.8H, v10.8H // ....................*......................................................
+ smlal v19.4S, v23.4H, v11.4H // ......................................*....................................
+ smlal2 v26.4S, v23.8H, v11.8H // ....................................*......................................
+ ld1 {v23.8H}, [x6], #16 // ........................*..................................................
+ smlal v25.4S, v9.4H, v11.4H // ......................*....................................................
+ smlal2 v5.4S, v9.8H, v11.8H // .......................*...................................................
+ smlal2 v26.4S, v9.8H, v23.8H // .......................................*...................................
+ smlal v19.4S, v9.4H, v23.4H // .........................................*.................................
+ ldr q9, [x4], #32 // ...............................*...........................................
+ uzp1 v11.8H, v25.8H, v5.8H // .........................*.................................................
+ uzp1 v23.8H, v19.8H, v26.8H // .............................................*.............................
+ mul v11.8H, v11.8H, v2.8H // ...........................*...............................................
+ mul v23.8H, v23.8H, v2.8H // ..............................................*............................
+ ldr q7, [x5], #32 // ................................*..........................................
+ smlal2 v5.4S, v11.8H, v0.8H // .............................*.............................................
+ smlal v25.4S, v11.4H, v0.4H // ..................................*........................................
+ ldr q11, [x2], #32 // .....................................*.....................................
+ ldr q21, [x2, #-16] // ........................................*..................................
+ ldr q6, [x4, #-16] // ...............................................*...........................
+ uzp1 v17.8H, v11.8H, v21.8H // ...........................................*...............................
+ ldr q10, [x1], #32 // ................................................*..........................
+ ldr q29, [x1, #-16] // .................................................*.........................
+ uzp2 v11.8H, v11.8H, v21.8H // ............................................*..............................
+ uzp1 v13.8H, v9.8H, v6.8H // ...................................................*.......................
+ uzp1 v3.8H, v10.8H, v29.8H // ....................................................*......................
+ uzp2 v10.8H, v10.8H, v29.8H // .....................................................*.....................
+ smull v12.4S, v3.4H, v11.4H // ......................................................*....................
+ smull2 v11.4S, v3.8H, v11.8H // .......................................................*...................
+ ldr q21, [x5, #-16] // ........................................................*..................
+ smlal v12.4S, v10.4H, v17.4H // .........................................................*.................
+ smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................
+ uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*...............
+ uzp1 v15.8H, v7.8H, v21.8H // ............................................................*..............
+ smlal v12.4S, v13.4H, v29.4H // .............................................................*.............
+ smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............
+ uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*...........
+ smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................
+ smlal v12.4S, v28.4H, v15.4H // .................................................................*.........
+ smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........
+ smlal v19.4S, v23.4H, v0.4H // ................................................................*..........
+ uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................
+ smull v23.4S, v3.4H, v17.4H // ......................................................................*....
+ uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*.....
+ uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*......
+ mul v14.8H, v9.8H, v2.8H // .......................................................................*...
+ ld1 {v22.8H}, [x6], #16 // ...................................................................*.......
+ zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
+ smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................*
+ ld1 {v4.8H}, [x3], #16 // .........................................................................*.
+
+ // ------------------------------ new position ------------------------------>
+ // 0 25 50
+ // |------------------------|------------------------|------------------------
+ // ldr q18, [x4], #32 // *..........................................................................
+ // ldr q30, [x5], #32 // ..*........................................................................
+ // ldr q8, [x2], #32 // .....*.....................................................................
+ // ldr q9, [x2, #-16] // .......*...................................................................
+ // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................
+ // uzp2 v4.8H, v8.8H, v9.8H // ...........*...............................................................
+ // ldr q19, [x4, #-16] // .*.........................................................................
+ // ldr q29, [x1], #32 // ............*..............................................................
+ // ldr q12, [x1, #-16] // .............*.............................................................
+ // uzp1 v13.8H, v18.8H, v19.8H // ...*.......................................................................
+ // uzp1 v3.8H, v29.8H, v12.8H // ...............*...........................................................
+ // uzp2 v10.8H, v29.8H, v12.8H // ................*..........................................................
+ // smull v12.4S, v3.4H, v4.4H // .................*.........................................................
+ // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................
+ // ldr q5, [x5, #-16] // ......*....................................................................
+ // smlal v12.4S, v10.4H, v17.4H // .....................*.....................................................
+ // smlal2 v11.4S, v10.8H, v17.8H // ......................*....................................................
+ // uzp2 v14.8H, v30.8H, v5.8H // ........*..................................................................
+ // uzp1 v15.8H, v30.8H, v5.8H // .........*.................................................................
+ // smlal v12.4S, v13.4H, v14.4H // .........................*.................................................
+ // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................
+ // uzp2 v28.8H, v18.8H, v19.8H // ....*......................................................................
+ // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................
+ // smlal2 v11.4S, v28.8H, v15.8H // ...............................*...........................................
+ // ld1 {v22.8H}, [x6], #16 // .............................*.............................................
+ // uzp1 v1.8H, v12.8H, v11.8H // ...................................*.......................................
+ // smull v23.4S, v3.4H, v17.4H // ...................*.......................................................
+ // mul v14.8H, v1.8H, v2.8H // .....................................*.....................................
+ // ld1 {v4.8H}, [x3], #16 // ..............*............................................................
+ // smlal2 v11.4S, v14.8H, v0.8H // ........................................*..................................
+ // smull2 v20.4S, v3.8H, v17.8H // ....................*......................................................
+ // ldr q18, [x4], #32 // ..................................*........................................
+ // ldr q30, [x5], #32 // .......................................*...................................
+ // smlal2 v20.4S, v10.8H, v4.8H // ........................*..................................................
+ // smlal v12.4S, v14.4H, v0.4H // .........................................*.................................
+ // smlal v23.4S, v10.4H, v4.4H // .......................*...................................................
+ // smlal2 v20.4S, v13.8H, v15.8H // ............................*..............................................
+ // ldr q8, [x2], #32 // ..........................................*................................
+ // smlal v23.4S, v13.4H, v15.4H // ...........................*...............................................
+ // smlal2 v20.4S, v28.8H, v22.8H // ................................*..........................................
+ // ldr q9, [x2, #-16] // ...........................................*...............................
+ // smlal v23.4S, v28.4H, v22.4H // .................................*.........................................
+ // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........
+ // uzp1 v17.8H, v8.8H, v9.8H // .............................................*.............................
+ // uzp2 v4.8H, v8.8H, v9.8H // ................................................*..........................
+ // uzp1 v5.8H, v23.8H, v20.8H // ....................................*......................................
+ // mul v31.8H, v5.8H, v2.8H // ......................................*....................................
+ // ldr q19, [x4, #-16] // ............................................*..............................
+ // ldr q29, [x1], #32 // ..............................................*............................
+ // ldr q12, [x1, #-16] // ...............................................*...........................
+ // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............
+ // uzp1 v13.8H, v18.8H, v19.8H // .................................................*.........................
+ // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................
+ // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*.......................
+ // smull v12.4S, v3.4H, v4.4H // ....................................................*......................
+ // smull2 v11.4S, v3.8H, v4.8H // .....................................................*.....................
+ // ldr q5, [x5, #-16] // ......................................................*....................
+ // smlal v12.4S, v10.4H, v17.4H // .......................................................*...................
+ // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*..................
+ // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*.................
+ // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................
+ // smlal v12.4S, v13.4H, v14.4H // ...........................................................*...............
+ // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*..............
+ // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*.............
+ // smlal v23.4S, v31.4H, v0.4H // .................................................................*.........
+ // smlal v12.4S, v28.4H, v15.4H // ...............................................................*...........
+ // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*..........
+ // ld1 {v22.8H}, [x6], #16 // .......................................................................*...
+ // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*.....
+ // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*......
+ // smull v23.4S, v3.4H, v17.4H // ...................................................................*.......
+ // mul v14.8H, v1.8H, v2.8H // ......................................................................*....
+ // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
+ // ld1 {v4.8H}, [x3], #16 // ..........................................................................*
+ // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*.
+
+ sub count, count, #2
+polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop:
+ // Instructions: 48
+ // Expected cycles: 58
+ // Expected IPC: 0.83
+
+ // Cycle bound: 58.0
+ // IPC bound: 0.83
+
+ // Wall time: 6.39s
+ // User time: 6.39s
+
+ // -------------- original position -------------->
+ // 0 25
+ // |------------------------|----------------------
+ smull2 v20.4S, v3.8H, v17.8H // ..........*.....................................
+ ldr q18, [x4], #32 // .................e..............................
+ ldr q30, [x5], #32 // .....................e..........................
+ smlal2 v20.4S, v10.8H, v4.8H // ............*...................................
+ smlal v12.4S, v14.4H, v0.4H // .........................................*......
+ smlal v23.4S, v10.4H, v4.4H // ...........*....................................
+ str q9, [x0, #16] // ...............................................l
+ smlal2 v20.4S, v13.8H, v15.8H // ...........................*....................
+ ldr q8, [x2], #32 // ....e...........................................
+ smlal v23.4S, v13.4H, v15.4H // ..........................*.....................
+ smlal2 v20.4S, v28.8H, v22.8H // .............................*..................
+ zip1 v26.8H, v19.8H, v27.8H // ............................................l...
+ ldr q9, [x2, #-16] // .....e..........................................
+ smlal v23.4S, v28.4H, v22.4H // ............................*...................
+ uzp2 v27.8H, v12.8H, v11.8H // ...........................................*....
+ uzp1 v17.8H, v8.8H, v9.8H // ......e.........................................
+ uzp2 v4.8H, v8.8H, v9.8H // .......e........................................
+ uzp1 v5.8H, v23.8H, v20.8H // ..................................*.............
+ str q26, [x0], #32 // ..............................................l.
+ mul v31.8H, v5.8H, v2.8H // ...................................*............
+ ldr q19, [x4, #-16] // ..................e.............................
+ ldr q29, [x1], #32 // e...............................................
+ ldr q12, [x1, #-16] // .e..............................................
+ smlal2 v20.4S, v31.8H, v0.8H // .....................................*..........
+ uzp1 v13.8H, v18.8H, v19.8H // ...................e............................
+ uzp1 v3.8H, v29.8H, v12.8H // ..e.............................................
+ uzp2 v10.8H, v29.8H, v12.8H // ...e............................................
+ smull v12.4S, v3.4H, v4.4H // .............e..................................
+ smull2 v11.4S, v3.8H, v4.8H // ..............e.................................
+ ldr q5, [x5, #-16] // ......................e.........................
+ smlal v12.4S, v10.4H, v17.4H // ...............e................................
+ smlal2 v11.4S, v10.8H, v17.8H // ................e...............................
+ uzp2 v14.8H, v30.8H, v5.8H // ........................e.......................
+ uzp1 v15.8H, v30.8H, v5.8H // .......................e........................
+ smlal v12.4S, v13.4H, v14.4H // ..............................e.................
+ smlal2 v11.4S, v13.8H, v14.8H // ...............................e................
+ uzp2 v28.8H, v18.8H, v19.8H // ....................e...........................
+ smlal v23.4S, v31.4H, v0.4H // ....................................*...........
+ smlal v12.4S, v28.4H, v15.4H // ................................e...............
+ smlal2 v11.4S, v28.8H, v15.8H // .................................e..............
+ ld1 {v22.8H}, [x6], #16 // .........................e......................
+ uzp2 v19.8H, v23.8H, v20.8H // ......................................*.........
+ uzp1 v1.8H, v12.8H, v11.8H // .......................................e........
+ smull v23.4S, v3.4H, v17.4H // .........e......................................
+ mul v14.8H, v1.8H, v2.8H // ........................................e.......
+ zip2 v9.8H, v19.8H, v27.8H // .............................................*..
+ ld1 {v4.8H}, [x3], #16 // ........e.......................................
+ smlal2 v11.4S, v14.8H, v0.8H // ..........................................e.....
+
+ // ------------------------------------------------- new position -------------------------------------------------->
+ // 0 25 50 75 100
+ // |------------------------|------------------------|------------------------|------------------------|-------------
+ // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'..................
+ // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'..................
+ // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'..................
+ // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'..................
+ // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~..........
+ // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~......
+ // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~...
+ // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~..
+ // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'..................
+ // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'..................
+ // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~..................
+ // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~.............
+ // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~...............
+ // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'..................
+ // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'..................
+ // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'..................
+ // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'..................
+ // ldr q12, [x4], #32 // e..............................................'~..............................................'~.................
+ // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'..................
+ // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'..................
+ // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'..................
+ // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................
+ // ldr q13, [x5, #-16] // ............................e..................'............................~..................'..................
+ // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'..................
+ // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'..................
+ // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'..................
+ // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~.........
+ // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~...........
+ // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~.....
+ // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........
+ // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'..................
+ // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'..................
+ // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'..................
+ // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'..................
+ // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~.
+ // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'..................
+ // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'..................
+ // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'..................
+ // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'..................
+ // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'..................
+ // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'..................
+ // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~..............
+ // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'..................
+ // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~....
+ // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l.......
+ // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'..................
+ // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l
+ // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............
+
+ sub count, count, #1
+ cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop
+ // Instructions: 21
+ // Expected cycles: 35
+ // Expected IPC: 0.60
+
+ // Cycle bound: 35.0
+ // IPC bound: 0.60
+
+ // Wall time: 0.08s
+ // User time: 0.08s
+
+ // ----- original position ----->
+ // 0 25
+ // |------------------------|----
+ smull2 v5.4S, v3.8H, v17.8H // *.............................
+ smlal v12.4S, v14.4H, v0.4H // ..*...........................
+ smlal v23.4S, v10.4H, v4.4H // ...*..........................
+ str q9, [x0, #16] // ....*.........................
+ smlal2 v5.4S, v10.8H, v4.8H // .*............................
+ uzp2 v11.8H, v12.8H, v11.8H // ..........*...................
+ zip1 v9.8H, v19.8H, v27.8H // ........*.....................
+ smlal v23.4S, v13.4H, v15.4H // ......*.......................
+ smlal2 v5.4S, v13.8H, v15.8H // .....*........................
+ str q9, [x0], #32 // ............*.................
+ smlal v23.4S, v28.4H, v22.4H // .........*....................
+ smlal2 v5.4S, v28.8H, v22.8H // .......*......................
+ uzp1 v9.8H, v23.8H, v5.8H // ...........*..................
+ mul v9.8H, v9.8H, v2.8H // .............*................
+ smlal2 v5.4S, v9.8H, v0.8H // ..............*...............
+ smlal v23.4S, v9.4H, v0.4H // ...............*..............
+ uzp2 v9.8H, v23.8H, v5.8H // ................*.............
+ zip2 v5.8H, v9.8H, v11.8H // .................*............
+ zip1 v9.8H, v9.8H, v11.8H // ...................*..........
+ str q5, [x0, #16] // ..................*...........
+ str q9, [x0], #32 // ....................*.........
+
+ // -------- new position -------->
+ // 0 25
+ // |------------------------|-----
+ // smull2 v20.4S, v3.8H, v17.8H // *..............................
+ // smlal2 v20.4S, v10.8H, v4.8H // ....*..........................
+ // smlal v12.4S, v14.4H, v0.4H // .*.............................
+ // smlal v23.4S, v10.4H, v4.4H // ..*............................
+ // str q9, [x0, #16] // ...*...........................
+ // smlal2 v20.4S, v13.8H, v15.8H // ........*......................
+ // smlal v23.4S, v13.4H, v15.4H // .......*.......................
+ // smlal2 v20.4S, v28.8H, v22.8H // ...........*...................
+ // zip1 v26.8H, v19.8H, v27.8H // ......*........................
+ // smlal v23.4S, v28.4H, v22.4H // ..........*....................
+ // uzp2 v27.8H, v12.8H, v11.8H // .....*.........................
+ // uzp1 v5.8H, v23.8H, v20.8H // ............*..................
+ // str q26, [x0], #32 // .........*.....................
+ // mul v31.8H, v5.8H, v2.8H // .............*.................
+ // smlal2 v20.4S, v31.8H, v0.8H // ..............*................
+ // smlal v23.4S, v31.4H, v0.4H // ...............*...............
+ // uzp2 v19.8H, v23.8H, v20.8H // ................*..............
+ // zip2 v9.8H, v19.8H, v27.8H // .................*.............
+ // str q9, [x0, #16] // ...................*...........
+ // zip1 v26.8H, v19.8H, v27.8H // ..................*............
+ // str q26, [x0], #32 // ....................*..........
+
+
+ pop_stack
+ ret
+
+/****************** REGISTER DEALLOCATIONS *******************/
+ .unreq out
+ .unreq a0_ptr
+ .unreq b0_ptr
+ .unreq b0_cache_ptr
+ .unreq a1_ptr
+ .unreq b1_ptr
+ .unreq b1_cache_ptr
+ .unreq a2_ptr
+ .unreq b2_ptr
+ .unreq b2_cache_ptr
+ .unreq a3_ptr
+ .unreq b3_ptr
+ .unreq b3_cache_ptr
+ .unreq count
+ .unreq modulus
+ .unreq modulus_twisted
+ .unreq wtmp
+ .unreq aa0
+ .unreq aa1
+ .unreq bb0
+ .unreq bb1
+ .unreq bb1t
+ .unreq res0l
+ .unreq res1l
+ .unreq res0h
+ .unreq res1h
+ .unreq tmp0
+ .unreq tmp1
+ .unreq q_tmp0
+ .unreq q_tmp1
+ .unreq out0
+ .unreq out1
+ .unreq t0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S
new file mode 100644
index 000000000..1c30ed6aa
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// AArch64 re-implementation of the asymmetric base multiplication from:
+
+// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+// https://eprint.iacr.org/2021/986
+// https://github.com/neon-ntt/neon-ntt
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 3
+/* simpasm: header-end */
+
+// Input:
+// - Vectors al, ah of 32-bit entries (register names formed by appending l / h to the a argument)
+// Output:
+// - Montgomery reductions of al || ah, stored in x; al, ah and t0 are overwritten
+.macro montgomery_reduce_long x, a
+ uzp1 t0.8h, \a\()l.8h, \a\()h.8h // t0 = low 16 bits of every 32-bit entry of al || ah
+ mul t0.8h, t0.8h, modulus_twisted.8h // t0 = t0 * modulus_twisted, truncated to 16 bits per lane
+ smlal \a\()l.4s, t0.4h, modulus.4h // al += widening product of t0 lanes 0-3 with modulus
+ smlal2 \a\()h.4s, t0.8h, modulus.8h // ah += widening product of t0 lanes 4-7 with modulus
+ uzp2 \x\().8h, \a\()l.8h, \a\()h.8h // x = high 16 bits of every 32-bit entry of al || ah
+.endm
+
+// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+
+// Bounds:
+// - Assume |a| < 4096,
+// - Result: < 2*4096*2^15 = 2^28
+.macro pmull d, a, b
+ smull \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0l = a0 * b0, lanes 0-3, widened to 32-bit
+ smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h = a0 * b0, lanes 4-7, widened to 32-bit
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t, lanes 0-3
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t, lanes 4-7
+
+ smull \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l = a0 * b1, lanes 0-3, widened to 32-bit
+ smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h = a0 * b1, lanes 4-7, widened to 32-bit
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0, lanes 0-3
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0, lanes 4-7
+.endm
+
+.macro pmlal d, a, b
+ smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h // accumulating form of pmull: d0l += a0 * b0, lanes 0-3
+ smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h += a0 * b0, lanes 4-7
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t, lanes 0-3
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t, lanes 4-7
+
+ smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l += a0 * b1, lanes 0-3
+ smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h += a0 * b1, lanes 4-7
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0, lanes 0-3
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0, lanes 4-7
+.endm
+
+.macro ld2_wrap a, ptr
+ ldr q_tmp0, [\ptr\()], #32 // tmp0 = first 16 bytes at ptr, then ptr advances by 32
+ ldr q_tmp1, [\ptr\(), #-16] // tmp1 = second 16 bytes of the same 32-byte chunk
+ uzp1 \a\()0.8h, tmp0.8h, tmp1.8h // a0 = even-indexed 16-bit elements of the chunk
+ uzp2 \a\()1.8h, tmp0.8h, tmp1.8h // a1 = odd-indexed 16-bit elements of the chunk
+.endm
+
+.macro st2_wrap a, ptr
+ zip1 tmp0.8h, \a\()0.8h, \a\()1.8h // tmp0 = lanes 0-3 of a0 and a1, interleaved
+ zip2 tmp1.8h, \a\()0.8h, \a\()1.8h // tmp1 = lanes 4-7 of a0 and a1, interleaved
+ str q_tmp0, [\ptr\()], #32 // store first 16 bytes at ptr, then ptr advances by 32
+ str q_tmp1, [\ptr\(), #-16] // store second 16 bytes of the same 32-byte chunk
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
+ ld2_wrap \a\(), \a_ptr // a0 / a1 = de-interleaved 32-byte chunk from a_ptr (pointer advanced)
+ ld2_wrap \b\(), \b_ptr // b0 / b1 = de-interleaved 32-byte chunk from b_ptr (pointer advanced)
+ ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 // b1t = 8 halfwords from b_cache_ptr, pointer advanced by 16 bytes
+.endm
+
+.macro save_vregs
+ sub sp, sp, #(16*4) // reserve 64 bytes of stack for d8-d15
+ stp d8, d9, [sp, #16*0] // d8-d15 are the low 64 bits of v8-v15, callee-saved under AAPCS64
+ stp d10, d11, [sp, #16*1]
+ stp d12, d13, [sp, #16*2]
+ stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+ ldp d8, d9, [sp, #16*0] // reload d8-d15 from the slots written by save_vregs
+ ldp d10, d11, [sp, #16*1]
+ ldp d12, d13, [sp, #16*2]
+ ldp d14, d15, [sp, #16*3]
+ add sp, sp, #(16*4) // release the 64-byte save area
+.endm
+
+.macro push_stack
+ save_vregs // function prologue: only d8-d15 are saved, no general-purpose registers
+.endm
+
+.macro pop_stack
+ restore_vregs // function epilogue: mirrors push_stack, restores d8-d15 and the stack pointer
+.endm
+
+ out .req x0 // presumably the output pointer argument - TODO confirm against the C prototype
+ a0_ptr .req x1 // base of vector a, entry 0
+ b0_ptr .req x2 // base of vector b, entry 0
+ b0_cache_ptr .req x3 // base of the b cache, entry 0
+ a1_ptr .req x4 // set in the prologue to a0_ptr + 1 * 512
+ b1_ptr .req x5 // set in the prologue to b0_ptr + 1 * 512
+ b1_cache_ptr .req x6 // set in the prologue to b0_cache_ptr + 1 * 512/2
+ a2_ptr .req x7 // set in the prologue to a0_ptr + 2 * 512
+ b2_ptr .req x8 // set in the prologue to b0_ptr + 2 * 512
+ b2_cache_ptr .req x9 // set in the prologue to b0_cache_ptr + 2 * 512/2
+ a3_ptr .req x10 // not initialised in the visible prologue of this k3 routine
+ b3_ptr .req x11 // not initialised in the visible prologue of this k3 routine
+ b3_cache_ptr .req x12 // not initialised in the visible prologue of this k3 routine
+ count .req x13 // loop counter
+ wtmp .req w14 // scratch for materialising the 16-bit constants below
+
+ modulus .req v0 // 3329 broadcast to all eight 16-bit lanes
+ modulus_twisted .req v2 // 3327 broadcast to all eight 16-bit lanes
+
+ aa0 .req v3 // aa0 / aa1: names match the a0 / a1 suffix scheme of ld2_wrap and pmull
+ aa1 .req v4
+ bb0 .req v5 // bb0 / bb1 / bb1t: names match the b0 / b1 / b1t suffix scheme of load_polys
+ bb1 .req v6
+ bb1t .req v7
+
+ res0l .req v8 // res0l / res0h / res1l / res1h: names match the d0l / d0h / d1l / d1h accumulators
+ res1l .req v9
+ res0h .req v10
+ res1h .req v11
+
+ tmp0 .req v12 // staging registers used by ld2_wrap and st2_wrap
+ tmp1 .req v13
+ q_tmp0 .req q12 // same register as tmp0, 128-bit view for ldr / str
+ q_tmp1 .req q13 // same register as tmp1, 128-bit view for ldr / str
+
+ out0 .req v26 // out0 / out1: presumably the reduced results handed to st2_wrap - TODO confirm
+ out1 .req v27
+
+ t0 .req v28 // scratch register of montgomery_reduce_long
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt):
+ push_stack
+ mov wtmp, #3329
+ dup modulus.8h, wtmp
+
+ mov wtmp, #3327
+ dup modulus_twisted.8h, wtmp
+
+ // Computed bases of vector entries
+
+ add a1_ptr, a0_ptr, #(1 * 512)
+ add b1_ptr, b0_ptr, #(1 * 512)
+ add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
+ add a2_ptr, a0_ptr, #(2 * 512)
+ add b2_ptr, b0_ptr, #(2 * 512)
+ add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
+
+ mov count, #(MLKEM_N / 16)
+ // Instructions: 75
+ // Expected cycles: 103
+ // Expected IPC: 0.73
+
+ // Cycle bound: 103.0
+ // IPC bound: 0.73
+
+ // Wall time: 0.94s
+ // User time: 0.94s
+
+ // --------------------------- original position ---------------------------->
+ // 0 25 50
+ // |------------------------|------------------------|
+ ldr q7, [x2, #16] // *..........................................................................
+ ldr q20, [x2], #32 // ..*........................................................................
+ ldr q15, [x1, #16] // .*.........................................................................
+ uzp1 v8.8H, v20.8H, v7.8H // ...............*...........................................................
+ uzp2 v7.8H, v20.8H, v7.8H // ................*..........................................................
+ ld1 {v20.8H}, [x3], #16 // ...*.......................................................................
+ ldr q30, [x1], #32 // ..............*............................................................
+ ldr q11, [x4], #32 // ....*......................................................................
+ uzp1 v16.8H, v30.8H, v15.8H // .................*.........................................................
+ uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................
+ smull v30.4S, v16.4H, v7.4H // ...................*.......................................................
+ smull2 v7.4S, v16.8H, v7.8H // ....................*......................................................
+ smull v9.4S, v16.4H, v8.4H // .....................*.....................................................
+ smull2 v16.4S, v16.8H, v8.8H // ......................*....................................................
+ smlal v30.4S, v15.4H, v8.4H // .......................*...................................................
+ smlal2 v7.4S, v15.8H, v8.8H // ........................*..................................................
+ smlal v9.4S, v15.4H, v20.4H // .........................*.................................................
+ smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................
+ ldr q20, [x4, #-16] // .....*.....................................................................
+ ldr q15, [x5], #32 // ......*....................................................................
+ uzp1 v8.8H, v11.8H, v20.8H // ...........................*...............................................
+ uzp2 v20.8H, v11.8H, v20.8H // ............................*..............................................
+ ldr q11, [x5, #-16] // .......*...................................................................
+ ld1 {v27.8H}, [x6], #16 // ........*..................................................................
+ uzp1 v10.8H, v15.8H, v11.8H // .............................*.............................................
+ uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................
+ smlal v9.4S, v8.4H, v10.4H // ...............................*...........................................
+ smlal2 v16.4S, v8.8H, v10.8H // ................................*..........................................
+ smlal v30.4S, v8.4H, v15.4H // .................................*.........................................
+ smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................
+ smlal v9.4S, v20.4H, v27.4H // ...................................*.......................................
+ smlal2 v16.4S, v20.8H, v27.8H // ....................................*......................................
+ smlal v30.4S, v20.4H, v10.4H // .....................................*.....................................
+ smlal2 v7.4S, v20.8H, v10.8H // ......................................*....................................
+ ldr q20, [x7], #32 // .........*.................................................................
+ ldr q15, [x7, #-16] // ..........*................................................................
+ ldr q8, [x8], #32 // ...........*...............................................................
+ uzp1 v11.8H, v20.8H, v15.8H // .......................................*...................................
+ uzp2 v20.8H, v20.8H, v15.8H // ........................................*..................................
+ ldr q15, [x8, #-16] // ............*..............................................................
+ ld1 {v27.8H}, [x9], #16 // .............*.............................................................
+ uzp1 v10.8H, v8.8H, v15.8H // .........................................*.................................
+ uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................
+ smlal v9.4S, v11.4H, v10.4H // ...........................................*...............................
+ smlal2 v16.4S, v11.8H, v10.8H // ............................................*..............................
+ smlal v30.4S, v11.4H, v15.4H // .............................................*.............................
+ smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................
+ smlal v9.4S, v20.4H, v27.4H // ...............................................*...........................
+ smlal2 v16.4S, v20.8H, v27.8H // ................................................*..........................
+ smlal v30.4S, v20.4H, v10.4H // .................................................*.........................
+ smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................
+ ldr q15, [x2], #32 // ...............................................................*...........
+ uzp1 v20.8H, v9.8H, v16.8H // ....................................................*......................
+ uzp1 v8.8H, v30.8H, v7.8H // .....................................................*.....................
+ mul v20.8H, v20.8H, v2.8H // ......................................................*....................
+ mul v8.8H, v8.8H, v2.8H // .......................................................*...................
+ ldr q21, [x4], #32 // .................................................................*.........
+ smlal v9.4S, v20.4H, v0.4H // ........................................................*..................
+ smlal2 v16.4S, v20.8H, v0.8H // .........................................................*.................
+ smlal v30.4S, v8.4H, v0.4H // ..........................................................*................
+ smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*...............
+ ldr q6, [x4, #-16] // ..................................................................*........
+ uzp2 v27.8H, v9.8H, v16.8H // ............................................................*..............
+ uzp2 v10.8H, v30.8H, v7.8H // .............................................................*.............
+ ldr q16, [x2, #-16] // ...................................................*.......................
+ ldr q30, [x1, #16] // ..............................................................*............
+ ld1 {v9.8H}, [x3], #16 // ................................................................*..........
+ ldr q1, [x5], #32 // ...................................................................*.......
+ ldr q12, [x5, #-16] // ....................................................................*......
+ ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
+ ldr q19, [x7], #32 // ......................................................................*....
+ ldr q31, [x7, #-16] // .......................................................................*...
+ ldr q17, [x8], #32 // ........................................................................*..
+ ldr q18, [x8, #-16] // .........................................................................*.
+ ld1 {v25.8H}, [x9], #16 // ..........................................................................*
+
+ // ------------------------------ new position ------------------------------>
+ // 0 25 50
+ // |------------------------|------------------------|------------------------
+ // ldr q16, [x2, #16] // *..........................................................................
+ // ldr q30, [x1, #16] // ..*........................................................................
+ // ldr q15, [x2], #32 // .*.........................................................................
+ // ld1 {v9.8H}, [x3], #16 // .....*.....................................................................
+ // ldr q21, [x4], #32 // .......*...................................................................
+ // ldr q6, [x4, #-16] // ..................*........................................................
+ // ldr q1, [x5], #32 // ...................*.......................................................
+ // ldr q12, [x5, #-16] // ......................*....................................................
+ // ld1 {v24.8H}, [x6], #16 // .......................*...................................................
+ // ldr q19, [x7], #32 // ..................................*........................................
+ // ldr q31, [x7, #-16] // ...................................*.......................................
+ // ldr q17, [x8], #32 // ....................................*......................................
+ // ldr q18, [x8, #-16] // .......................................*...................................
+ // ld1 {v25.8H}, [x9], #16 // ........................................*..................................
+ // ldr q20, [x1], #32 // ......*....................................................................
+ // uzp1 v7.8H, v15.8H, v16.8H // ...*.......................................................................
+ // uzp2 v15.8H, v15.8H, v16.8H // ....*......................................................................
+ // uzp1 v8.8H, v20.8H, v30.8H // ........*..................................................................
+ // uzp2 v20.8H, v20.8H, v30.8H // .........*.................................................................
+ // smull v30.4S, v8.4H, v15.4H // ..........*................................................................
+ // smull2 v15.4S, v8.8H, v15.8H // ...........*...............................................................
+ // smull v11.4S, v8.4H, v7.4H // ............*..............................................................
+ // smull2 v8.4S, v8.8H, v7.8H // .............*.............................................................
+ // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................
+ // smlal2 v15.4S, v20.8H, v7.8H // ...............*...........................................................
+ // smlal v11.4S, v20.4H, v9.4H // ................*..........................................................
+ // smlal2 v8.4S, v20.8H, v9.8H // .................*.........................................................
+ // uzp1 v7.8H, v21.8H, v6.8H // ....................*......................................................
+ // uzp2 v20.8H, v21.8H, v6.8H // .....................*.....................................................
+ // uzp1 v16.8H, v1.8H, v12.8H // ........................*..................................................
+ // uzp2 v9.8H, v1.8H, v12.8H // .........................*.................................................
+ // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................
+ // smlal2 v8.4S, v7.8H, v16.8H // ...........................*...............................................
+ // smlal v30.4S, v7.4H, v9.4H // ............................*..............................................
+ // smlal2 v15.4S, v7.8H, v9.8H // .............................*.............................................
+ // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................
+ // smlal2 v8.4S, v20.8H, v24.8H // ...............................*...........................................
+ // smlal v30.4S, v20.4H, v16.4H // ................................*..........................................
+ // smlal2 v15.4S, v20.8H, v16.8H // .................................*.........................................
+ // uzp1 v7.8H, v19.8H, v31.8H // .....................................*.....................................
+ // uzp2 v20.8H, v19.8H, v31.8H // ......................................*....................................
+ // uzp1 v16.8H, v17.8H, v18.8H // .........................................*.................................
+ // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................
+ // smlal v11.4S, v7.4H, v16.4H // ...........................................*...............................
+ // smlal2 v8.4S, v7.8H, v16.8H // ............................................*..............................
+ // smlal v30.4S, v7.4H, v9.4H // .............................................*.............................
+ // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................
+ // smlal v11.4S, v20.4H, v25.4H // ...............................................*...........................
+ // smlal2 v8.4S, v20.8H, v25.8H // ................................................*..........................
+ // smlal v30.4S, v20.4H, v16.4H // .................................................*.........................
+ // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................
+ // ldr q16, [x2, #16] // ................................................................*..........
+ // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*......................
+ // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*.....................
+ // mul v7.8H, v7.8H, v2.8H // ......................................................*....................
+ // mul v20.8H, v20.8H, v2.8H // .......................................................*...................
+ // smlal v11.4S, v7.4H, v0.4H // .........................................................*.................
+ // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................
+ // smlal v30.4S, v20.4H, v0.4H // ...........................................................*...............
+ // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*..............
+ // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............
+ // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*...........
+ // ldr q30, [x1, #16] // .................................................................*.........
+ // ldr q15, [x2], #32 // ...................................................*.......................
+ // ld1 {v9.8H}, [x3], #16 // ..................................................................*........
+ // ldr q21, [x4], #32 // ........................................................*..................
+ // ldr q6, [x4, #-16] // .............................................................*.............
+ // ldr q1, [x5], #32 // ...................................................................*.......
+ // ldr q12, [x5, #-16] // ....................................................................*......
+ // ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
+ // ldr q19, [x7], #32 // ......................................................................*....
+ // ldr q31, [x7, #-16] // .......................................................................*...
+ // ldr q17, [x8], #32 // ........................................................................*..
+ // ldr q18, [x8, #-16] // .........................................................................*.
+ // ld1 {v25.8H}, [x9], #16 // ..........................................................................*
+
+ sub count, count, #2
+polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop:
+ // Instructions: 65
+ // Expected cycles: 80
+ // Expected IPC: 0.81
+
+ // Cycle bound: 80.0
+ // IPC bound: 0.81
+
+ // Wall time: 11.64s
+ // User time: 11.64s
+
+ // ---------------------- original position ----------------------->
+ // 0 25 50
+ // |------------------------|------------------------|--------------
+ ldr q20, [x1], #32 // *................................................................
+ uzp1 v7.8H, v15.8H, v16.8H // ......*..........................................................
+ uzp2 v15.8H, v15.8H, v16.8H // .......*.........................................................
+ uzp1 v8.8H, v20.8H, v30.8H // ..*..............................................................
+ uzp2 v20.8H, v20.8H, v30.8H // ...*.............................................................
+ smull v30.4S, v8.4H, v15.4H // .............*...................................................
+ smull2 v15.4S, v8.8H, v15.8H // ..............*..................................................
+ smull v11.4S, v8.4H, v7.4H // .........*.......................................................
+ smull2 v8.4S, v8.8H, v7.8H // ..........*......................................................
+ smlal v30.4S, v20.4H, v7.4H // ...............*.................................................
+ smlal2 v15.4S, v20.8H, v7.8H // ................*................................................
+ smlal v11.4S, v20.4H, v9.4H // ...........*.....................................................
+ smlal2 v8.4S, v20.8H, v9.8H // ............*....................................................
+ uzp1 v7.8H, v21.8H, v6.8H // ...................*.............................................
+ uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................
+ uzp1 v16.8H, v1.8H, v12.8H // .......................*.........................................
+ uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................
+ smlal v11.4S, v7.4H, v16.4H // ..........................*......................................
+ smlal2 v8.4S, v7.8H, v16.8H // ...........................*.....................................
+ smlal v30.4S, v7.4H, v9.4H // ..............................*..................................
+ smlal2 v15.4S, v7.8H, v9.8H // ...............................*.................................
+ smlal v11.4S, v20.4H, v24.4H // ............................*....................................
+ smlal2 v8.4S, v20.8H, v24.8H // .............................*...................................
+ smlal v30.4S, v20.4H, v16.4H // ................................*................................
+ smlal2 v15.4S, v20.8H, v16.8H // .................................*...............................
+ uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................
+ uzp2 v20.8H, v19.8H, v31.8H // .....................................*...........................
+ uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................
+ uzp2 v9.8H, v17.8H, v18.8H // .........................................*.......................
+ smlal v11.4S, v7.4H, v16.4H // ...........................................*.....................
+ smlal2 v8.4S, v7.8H, v16.8H // ............................................*....................
+ smlal v30.4S, v7.4H, v9.4H // ...............................................*.................
+ smlal2 v15.4S, v7.8H, v9.8H // ................................................*................
+ smlal v11.4S, v20.4H, v25.4H // .............................................*...................
+ smlal2 v8.4S, v20.8H, v25.8H // ..............................................*..................
+ smlal v30.4S, v20.4H, v16.4H // .................................................*...............
+ smlal2 v15.4S, v20.8H, v16.8H // ..................................................*..............
+ ldr q16, [x2, #16] // .....e...........................................................
+ uzp1 v7.8H, v11.8H, v8.8H // ...................................................*.............
+ uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........
+ mul v7.8H, v7.8H, v2.8H // ....................................................*............
+ mul v20.8H, v20.8H, v2.8H // .........................................................*.......
+ zip2 v9.8H, v27.8H, v10.8H // ..............................................................l..
+ zip1 v27.8H, v27.8H, v10.8H // .............................................................l...
+ smlal v11.4S, v7.4H, v0.4H // .....................................................*...........
+ smlal2 v8.4S, v7.8H, v0.8H // ......................................................*..........
+ smlal v30.4S, v20.4H, v0.4H // ..........................................................*......
+ smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*.....
+ str q27, [x0], #32 // ...............................................................l.
+ uzp2 v27.8H, v11.8H, v8.8H // .......................................................*.........
+ str q9, [x0, #-16] // ................................................................l
+ uzp2 v10.8H, v30.8H, v15.8H // ............................................................*....
+ ldr q30, [x1, #16] // .e...............................................................
+ ldr q15, [x2], #32 // ....e............................................................
+ ld1 {v9.8H}, [x3], #16 // ........e........................................................
+ ldr q21, [x4], #32 // .................e...............................................
+ ldr q6, [x4, #-16] // ..................e..............................................
+ ldr q1, [x5], #32 // .....................e...........................................
+ ldr q12, [x5, #-16] // ......................e..........................................
+ ld1 {v24.8H}, [x6], #16 // .........................e.......................................
+ ldr q19, [x7], #32 // ..................................e..............................
+ ldr q31, [x7, #-16] // ...................................e.............................
+ ldr q17, [x8], #32 // ......................................e..........................
+ ldr q18, [x8, #-16] // .......................................e.........................
+ ld1 {v25.8H}, [x9], #16 // ..........................................e......................
+
+ // ---------------------------------------------------------------- new position ----------------------------------------------------------------->
+ // 0 25 50 75 100 125
+ // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------
+ // ldr q12, [x1], #32 // ............................*................................................................~..................................................
+ // ldr q13, [x1, #-16] // ...............e............'...................................................~............'..................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~...............................................
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~..............................................
+ // ldr q12, [x2], #32 // ................e...........'....................................................~...........'..................................................
+ // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~.............
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~.................................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................
+ // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'..................................................
+ // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~...........................................
+ // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~..........................................
+ // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~.......................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~......................................
+ // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~.............................................
+ // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................
+ // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~.........................................
+ // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................
+ // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'..................................................
+ // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'..................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~.....................................
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~....................................
+ // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'..................................................
+ // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'..................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~...................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~..................................
+ // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'..................................................
+ // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~.................................
+ // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................
+ // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~.............................
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................
+ // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~...............................
+ // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~..............................
+ // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~...........................
+ // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~..........................
+ // ldr q12, [x7], #32 // .......................e....'...........................................................~....'..................................................
+ // ldr q13, [x7, #-16] // ........................e...'............................................................~...'..................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~.........................
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................
+ // ldr q12, [x8], #32 // .........................e..'.............................................................~..'..................................................
+ // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'..................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~.......................
+ // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~......................
+ // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'..................................................
+ // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~.....................
+ // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~....................
+ // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~.................
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................
+ // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~...................
+ // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~..................
+ // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~...............
+ // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~..............
+ // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............
+ // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~..........
+ // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~......
+ // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~.....
+ // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~.
+ // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~...........
+ // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~.........
+ // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~....
+ // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~...
+ // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'..................................................
+ // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l.......
+ // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........
+ // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l..
+ // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l
+
+ sub count, count, #1
+ cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop
+ // Instructions: 55
+ // Expected cycles: 61
+ // Expected IPC: 0.90
+
+ // Cycle bound: 61.0
+ // IPC bound: 0.90
+
+ // Wall time: 8.41s
+ // User time: 8.41s
+
+ // ----------------- original position ------------------>
+ // 0 25 50
+ // |------------------------|------------------------|----
+ ldr q7, [x1], #32 // *......................................................
+ uzp1 v20.8H, v15.8H, v16.8H // .*.....................................................
+ uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
+ uzp1 v23.8H, v7.8H, v30.8H // ...*...................................................
+ uzp2 v11.8H, v7.8H, v30.8H // ....*..................................................
+ smull2 v8.4S, v23.8H, v20.8H // ........*..............................................
+ smull v5.4S, v23.4H, v20.4H // .......*...............................................
+ smull2 v30.4S, v23.8H, v15.8H // ......*................................................
+ uzp1 v28.8H, v1.8H, v12.8H // ...............*.......................................
+ smlal2 v8.4S, v11.8H, v9.8H // ............*..........................................
+ smlal v5.4S, v11.4H, v9.4H // ...........*...........................................
+ uzp1 v3.8H, v21.8H, v6.8H // .............*.........................................
+ smull v16.4S, v23.4H, v15.4H // .....*.................................................
+ smlal2 v8.4S, v3.8H, v28.8H // ..................*....................................
+ smlal v5.4S, v3.4H, v28.4H // .................*.....................................
+ uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................
+ uzp1 v7.8H, v17.8H, v18.8H // ...........................*...........................
+ smlal2 v8.4S, v29.8H, v24.8H // ......................*................................
+ uzp1 v14.8H, v19.8H, v31.8H // .........................*.............................
+ smlal v16.4S, v11.4H, v20.4H // .........*.............................................
+ smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................
+ smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................
+ uzp2 v20.8H, v1.8H, v12.8H // ................*......................................
+ uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................
+ smlal2 v30.4S, v3.8H, v20.8H // ....................*..................................
+ smlal v16.4S, v3.4H, v20.4H // ...................*...................................
+ smlal v5.4S, v29.4H, v24.4H // .....................*.................................
+ uzp2 v9.8H, v17.8H, v18.8H // ............................*..........................
+ smlal2 v30.4S, v29.8H, v28.8H // ........................*..............................
+ smlal v16.4S, v29.4H, v28.4H // .......................*...............................
+ smlal v5.4S, v14.4H, v7.4H // .............................*.........................
+ smlal2 v8.4S, v21.8H, v25.8H // ..................................*....................
+ smlal2 v30.4S, v14.8H, v9.8H // ................................*......................
+ smlal v16.4S, v14.4H, v9.4H // ...............................*.......................
+ smlal v5.4S, v21.4H, v25.4H // .................................*.....................
+ zip1 v20.8H, v27.8H, v10.8H // ..........................................*............
+ smlal2 v30.4S, v21.8H, v7.8H // ....................................*..................
+ smlal v16.4S, v21.4H, v7.4H // ...................................*...................
+ uzp1 v7.8H, v5.8H, v8.8H // .....................................*.................
+ str q20, [x0], #32 // ...............................................*.......
+ mul v15.8H, v7.8H, v2.8H // .......................................*...............
+ uzp1 v7.8H, v16.8H, v30.8H // ......................................*................
+ zip2 v31.8H, v27.8H, v10.8H // .........................................*.............
+ mul v20.8H, v7.8H, v2.8H // ........................................*..............
+ smlal v5.4S, v15.4H, v0.4H // ...........................................*...........
+ smlal2 v8.4S, v15.8H, v0.8H // ............................................*..........
+ str q31, [x0, #-16] // .................................................*.....
+ smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........
+ smlal v16.4S, v20.4H, v0.4H // .............................................*.........
+ uzp2 v15.8H, v5.8H, v8.8H // ................................................*......
+ uzp2 v20.8H, v16.8H, v30.8H // ..................................................*....
+ zip1 v7.8H, v15.8H, v20.8H // ....................................................*..
+ zip2 v20.8H, v15.8H, v20.8H // ...................................................*...
+ str q7, [x0], #32 // .....................................................*.
+ str q20, [x0, #-16] // ......................................................*
+
+ // -------------------- new position -------------------->
+ // 0 25 50
+ // |------------------------|------------------------|----
+ // ldr q20, [x1], #32 // *......................................................
+ // uzp1 v7.8H, v15.8H, v16.8H // .*.....................................................
+ // uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
+ // uzp1 v8.8H, v20.8H, v30.8H // ...*...................................................
+ // uzp2 v20.8H, v20.8H, v30.8H // ....*..................................................
+ // smull v30.4S, v8.4H, v15.4H // ............*..........................................
+ // smull2 v15.4S, v8.8H, v15.8H // .......*...............................................
+ // smull v11.4S, v8.4H, v7.4H // ......*................................................
+ // smull2 v8.4S, v8.8H, v7.8H // .....*.................................................
+ // smlal v30.4S, v20.4H, v7.4H // ...................*...................................
+ // smlal2 v15.4S, v20.8H, v7.8H // ....................*..................................
+ // smlal v11.4S, v20.4H, v9.4H // ..........*............................................
+ // smlal2 v8.4S, v20.8H, v9.8H // .........*.............................................
+ // uzp1 v7.8H, v21.8H, v6.8H // ...........*...........................................
+ // uzp2 v20.8H, v21.8H, v6.8H // ...............*.......................................
+ // uzp1 v16.8H, v1.8H, v12.8H // ........*..............................................
+ // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................
+ // smlal v11.4S, v7.4H, v16.4H // ..............*........................................
+ // smlal2 v8.4S, v7.8H, v16.8H // .............*.........................................
+ // smlal v30.4S, v7.4H, v9.4H // .........................*.............................
+ // smlal2 v15.4S, v7.8H, v9.8H // ........................*..............................
+ // smlal v11.4S, v20.4H, v24.4H // ..........................*............................
+ // smlal2 v8.4S, v20.8H, v24.8H // .................*.....................................
+ // smlal v30.4S, v20.4H, v16.4H // .............................*.........................
+ // smlal2 v15.4S, v20.8H, v16.8H // ............................*..........................
+ // uzp1 v7.8H, v19.8H, v31.8H // ..................*....................................
+ // uzp2 v20.8H, v19.8H, v31.8H // .......................*...............................
+ // uzp1 v16.8H, v17.8H, v18.8H // ................*......................................
+ // uzp2 v9.8H, v17.8H, v18.8H // ...........................*...........................
+ // smlal v11.4S, v7.4H, v16.4H // ..............................*........................
+ // smlal2 v8.4S, v7.8H, v16.8H // .....................*.................................
+ // smlal v30.4S, v7.4H, v9.4H // .................................*.....................
+ // smlal2 v15.4S, v7.8H, v9.8H // ................................*......................
+ // smlal v11.4S, v20.4H, v25.4H // ..................................*....................
+ // smlal2 v8.4S, v20.8H, v25.8H // ...............................*.......................
+ // smlal v30.4S, v20.4H, v16.4H // .....................................*.................
+ // smlal2 v15.4S, v20.8H, v16.8H // ....................................*..................
+ // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................
+ // uzp1 v20.8H, v30.8H, v15.8H // .........................................*.............
+ // mul v7.8H, v7.8H, v2.8H // ........................................*..............
+ // mul v20.8H, v20.8H, v2.8H // ...........................................*...........
+ // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............
+ // zip1 v27.8H, v27.8H, v10.8H // ...................................*...................
+ // smlal v11.4S, v7.4H, v0.4H // ............................................*..........
+ // smlal2 v8.4S, v7.8H, v0.8H // .............................................*.........
+ // smlal v30.4S, v20.4H, v0.4H // ................................................*......
+ // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*.......
+ // str q27, [x0], #32 // .......................................*...............
+ // uzp2 v27.8H, v11.8H, v8.8H // .................................................*.....
+ // str q9, [x0, #-16] // ..............................................*........
+ // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*....
+ // zip2 v9.8H, v27.8H, v10.8H // ....................................................*..
+ // zip1 v27.8H, v27.8H, v10.8H // ...................................................*...
+ // str q27, [x0], #32 // .....................................................*.
+ // str q9, [x0, #-16] // ......................................................*
+
+
+ pop_stack
+ ret
+
+/****************** REGISTER DEALLOCATIONS *******************/
+ .unreq out // .unreq removes a register alias created with .req; the aliases released here are presumably defined earlier in this file
+ .unreq a0_ptr
+ .unreq b0_ptr
+ .unreq b0_cache_ptr
+ .unreq a1_ptr
+ .unreq b1_ptr
+ .unreq b1_cache_ptr
+ .unreq a2_ptr
+ .unreq b2_ptr
+ .unreq b2_cache_ptr
+ .unreq a3_ptr
+ .unreq b3_ptr
+ .unreq b3_cache_ptr
+ .unreq count
+ .unreq modulus
+ .unreq modulus_twisted
+ .unreq wtmp
+ .unreq aa0
+ .unreq aa1
+ .unreq bb0
+ .unreq bb1
+ .unreq bb1t
+ .unreq res0l
+ .unreq res1l
+ .unreq res0h
+ .unreq res1h
+ .unreq tmp0
+ .unreq tmp1
+ .unreq q_tmp0
+ .unreq q_tmp1
+ .unreq out0
+ .unreq out1
+ .unreq t0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S
new file mode 100644
index 000000000..c3d70ed42
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// AArch64 re-implementation of the asymmetric base multiplication from:
+
+// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+// https://eprint.iacr.org/2021/986
+// https://github.com/neon-ntt/neon-ntt
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 4
+/* simpasm: header-end */
+
+// Input:
+// - a: prefix of a register pair al, ah holding eight 32-bit entries
+// Output:
+// - x: Montgomery reductions of al || ah as 16-bit lanes; al, ah, t0 are clobbered
+.macro montgomery_reduce_long x, a
+ uzp1 t0.8h, \a\()l.8h, \a\()h.8h // t0 = low halfword of each 32-bit entry
+ mul t0.8h, t0.8h, modulus_twisted.8h // t0 = t0 * modulus_twisted, truncated to 16 bits
+ smlal \a\()l.4s, t0.4h, modulus.4h // al += t0[0..3] * modulus (signed, widening)
+ smlal2 \a\()h.4s, t0.8h, modulus.8h // ah += t0[4..7] * modulus (signed, widening)
+ uzp2 \x\().8h, \a\()l.8h, \a\()h.8h // x = high halfword of each updated entry
+.endm
+
+// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+// The first component is written to d0l/d0h, the second to d1l/d1h.
+// Bounds:
+// - Assume |a| < 4096 (b entries are signed 16-bit, so |b| <= 2^15),
+// - Result: < 2*4096*2^15 = 2^28
+.macro pmull d, a, b
+ smull \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0l = a0 * b0 (lanes 0-3)
+ smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h = a0 * b0 (lanes 4-7)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t (lanes 0-3)
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t (lanes 4-7)
+
+ smull \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l = a0 * b1 (lanes 0-3)
+ smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h = a0 * b1 (lanes 4-7)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0 (lanes 0-3)
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0 (lanes 4-7)
+.endm
+// Accumulating form of pmull: adds (a0*b0 + a1*b1t, a0*b1 + a1*b0) onto the 32-bit values already in d0l/d0h and d1l/d1h.
+.macro pmlal d, a, b
+ smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0l += a0 * b0 (lanes 0-3)
+ smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h += a0 * b0 (lanes 4-7)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t (lanes 0-3)
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t (lanes 4-7)
+
+ smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l += a0 * b1 (lanes 0-3)
+ smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h += a0 * b1 (lanes 4-7)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0 (lanes 0-3)
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0 (lanes 4-7)
+.endm
+// Loads 32 bytes from ptr (post-incremented by 32) and de-interleaves the 16 halfwords: a0 gets the even-numbered lanes, a1 the odd-numbered ones.
+.macro ld2_wrap a, ptr
+ ldr q_tmp0, [\ptr\()], #32 // tmp0 = first 16 bytes; ptr += 32
+ ldr q_tmp1, [\ptr\(), #-16] // tmp1 = second 16 bytes, addressed behind the advanced ptr
+ uzp1 \a\()0.8h, tmp0.8h, tmp1.8h // a0 = even-numbered halfword lanes of tmp0 || tmp1
+ uzp2 \a\()1.8h, tmp0.8h, tmp1.8h // a1 = odd-numbered halfword lanes of tmp0 || tmp1
+.endm
+// Interleaves the halfword lanes of a0 and a1 (a0[0], a1[0], a0[1], ...) and stores the 32 bytes at ptr, which is post-incremented by 32.
+.macro st2_wrap a, ptr
+ zip1 tmp0.8h, \a\()0.8h, \a\()1.8h // tmp0 = interleave of lanes 0-3 of a0 and a1
+ zip2 tmp1.8h, \a\()0.8h, \a\()1.8h // tmp1 = interleave of lanes 4-7 of a0 and a1
+ str q_tmp0, [\ptr\()], #32 // store first 16 bytes; ptr += 32
+ str q_tmp1, [\ptr\(), #-16] // store second 16 bytes, addressed behind the advanced ptr
+.endm
+// Loads one 32-byte chunk of each operand: a0/a1 from a_ptr, b0/b1 from b_ptr, and the eight-halfword vector b1t from b_cache_ptr. All three pointers are advanced.
+.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
+ ld2_wrap \a\(), \a_ptr // a0, a1 = de-interleaved halfwords; a_ptr += 32
+ ld2_wrap \b\(), \b_ptr // b0, b1 = de-interleaved halfwords; b_ptr += 32
+ ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 // b1t = next 16 bytes of the cache; b_cache_ptr += 16
+.endm
+// Reserves 64 bytes of stack and spills d8-d15 (the low 64 bits of v8-v15), which AAPCS64 requires a callee to preserve.
+.macro save_vregs
+ sub sp, sp, #(16*4) // make room for four register pairs
+ stp d8, d9, [sp, #16*0]
+ stp d10, d11, [sp, #16*1]
+ stp d12, d13, [sp, #16*2]
+ stp d14, d15, [sp, #16*3]
+.endm
+// Inverse of save_vregs: reloads d8-d15 from the 64-byte stack area and releases it.
+.macro restore_vregs
+ ldp d8, d9, [sp, #16*0]
+ ldp d10, d11, [sp, #16*1]
+ ldp d12, d13, [sp, #16*2]
+ ldp d14, d15, [sp, #16*3]
+ add sp, sp, #(16*4) // release the area reserved by save_vregs
+.endm
+// Function prologue helper: the only state it saves is the SIMD register set handled by save_vregs.
+.macro push_stack
+ save_vregs
+.endm
+// Function epilogue helper: undoes push_stack by restoring the SIMD registers via restore_vregs.
+.macro pop_stack
+ restore_vregs
+.endm
+// Symbolic register names (.req aliases) for this file.
+ out .req x0 // x0-x3: presumably the incoming arguments (TODO confirm against the C prototype)
+ a0_ptr .req x1
+ b0_ptr .req x2
+ b0_cache_ptr .req x3
+ a1_ptr .req x4 // x4-x12: computed at function entry from a0_ptr, b0_ptr and b0_cache_ptr
+ b1_ptr .req x5
+ b1_cache_ptr .req x6
+ a2_ptr .req x7
+ b2_ptr .req x8
+ b2_cache_ptr .req x9
+ a3_ptr .req x10
+ b3_ptr .req x11
+ b3_cache_ptr .req x12
+ count .req x13 // set to MLKEM_N / 16 at function entry
+ wtmp .req w14 // scratch used to materialise the constants 3329 and 3327
+
+ modulus .req v0 // 3329 broadcast to all eight halfword lanes
+ modulus_twisted .req v2 // 3327 broadcast to all eight halfword lanes
+
+ aa0 .req v3 // aa0/aa1 and bb0/bb1/bb1t fit the a0/a1 and b0/b1/b1t operands of pmull and pmlal
+ aa1 .req v4
+ bb0 .req v5
+ bb1 .req v6
+ bb1t .req v7
+
+ res0l .req v8 // res0l/res0h and res1l/res1h fit the d0l/d0h and d1l/d1h operands of pmull and pmlal
+ res1l .req v9
+ res0h .req v10
+ res1h .req v11
+
+ tmp0 .req v12 // tmp0/q_tmp0 name the same register: vector view for uzp/zip, q view for ldr/str
+ tmp1 .req v13 // likewise tmp1/q_tmp1
+ q_tmp0 .req q12
+ q_tmp1 .req q13
+
+ out0 .req v26 // out0/out1 fit the a0/a1 operands of st2_wrap
+ out1 .req v27
+
+ t0 .req v28 // scratch register of montgomery_reduce_long
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt):
+ push_stack
+ mov wtmp, #3329
+ dup modulus.8h, wtmp
+
+ mov wtmp, #3327
+ dup modulus_twisted.8h, wtmp
+
+ // Computed bases of vector entries
+
+ add a1_ptr, a0_ptr, #(1 * 512)
+ add b1_ptr, b0_ptr, #(1 * 512)
+ add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
+ add a2_ptr, a0_ptr, #(2 * 512)
+ add b2_ptr, b0_ptr, #(2 * 512)
+ add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
+ add a3_ptr, a0_ptr, #(3 * 512)
+ add b3_ptr, b0_ptr, #(3 * 512)
+ add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2)
+
+ // Bounds:
+
+ // Each pmull/pmlal product is bounded by 2*4096*2^15=2^28, so the sum of the
+ // MLKEM_K = 4 products before Montgomery reduction is bounded by 4*2^28=2^30.
+
+ mov count, #(MLKEM_N / 16)
+ // Instructions: 114
+ // Expected cycles: 153
+ // Expected IPC: 0.75
+ //
+ // Cycle bound: 153.0
+ // IPC bound: 0.75
+ //
+ // Wall time: 0.69s
+ // User time: 0.69s
+ //
+ // ----------------------------------------------- original position ----------------------------------------------->
+ // 0 25 50 75 100
+ // |------------------------|------------------------|------------------------|------------------------|-------------
+ ldr q23, [x2, #16] // .*................................................................................................................
+ ldr q19, [x2], #32 // *.................................................................................................................
+ ldr q17, [x5], #32 // ..*...............................................................................................................
+ uzp2 v13.8H, v19.8H, v23.8H // ..........*.......................................................................................................
+ uzp1 v19.8H, v19.8H, v23.8H // ...........*......................................................................................................
+ ldr q23, [x5, #-16] // ...*..............................................................................................................
+ ldr q30, [x1, #16] // .....*............................................................................................................
+ uzp2 v9.8H, v17.8H, v23.8H // ....*.............................................................................................................
+ uzp1 v23.8H, v17.8H, v23.8H // .......*..........................................................................................................
+ ldr q17, [x1], #32 // ......*...........................................................................................................
+ ldr q10, [x7, #16] // .............*....................................................................................................
+ uzp1 v12.8H, v17.8H, v30.8H // ........*.........................................................................................................
+ uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................
+ smull2 v30.4S, v12.8H, v13.8H // ............*.....................................................................................................
+ smull v13.4S, v12.4H, v13.4H // ............................................*.....................................................................
+ smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................
+ smull v12.4S, v12.4H, v19.4H // ..........................................*.......................................................................
+ smlal2 v30.4S, v17.8H, v19.8H // ...............................*..................................................................................
+ smlal v13.4S, v17.4H, v19.4H // ...............................................*..................................................................
+ ldr q19, [x4], #32 // ....................*.............................................................................................
+ ldr q16, [x4, #-16] // .....................*............................................................................................
+ ld1 {v8.8H}, [x3], #16 // ................................*.................................................................................
+ uzp1 v26.8H, v19.8H, v16.8H // .......................*..........................................................................................
+ uzp2 v19.8H, v19.8H, v16.8H // ........................*.........................................................................................
+ smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................
+ smlal v13.4S, v26.4H, v9.4H // ..................................................*...............................................................
+ smlal2 v22.4S, v17.8H, v8.8H // ........................................*.........................................................................
+ smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................
+ smlal2 v30.4S, v19.8H, v23.8H // ...................................*..............................................................................
+ smlal v13.4S, v19.4H, v23.4H // .......................................................*..........................................................
+ smlal2 v22.4S, v26.8H, v23.8H // ...........................................*......................................................................
+ smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................
+ ldr q23, [x7], #32 // ......................*...........................................................................................
+ ldr q17, [x8, #16] // ..............*...................................................................................................
+ uzp1 v9.8H, v23.8H, v10.8H // ..........................*.......................................................................................
+ uzp2 v23.8H, v23.8H, v10.8H // ....................................*.............................................................................
+ ldr q10, [x10], #32 // ...............*..................................................................................................
+ ldr q16, [x10, #-16] // ................*.................................................................................................
+ ld1 {v8.8H}, [x12], #16 // .................*................................................................................................
+ uzp1 v26.8H, v10.8H, v16.8H // ..................*...............................................................................................
+ uzp2 v10.8H, v10.8H, v16.8H // ...................*..............................................................................................
+ ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................
+ ldr q3, [x11, #16] // ...........................*......................................................................................
+ smlal2 v22.4S, v19.8H, v16.8H // ..............................................*...................................................................
+ smlal v12.4S, v19.4H, v16.4H // ........................................................*.........................................................
+ ldr q19, [x11], #32 // ............................*.....................................................................................
+ ld1 {v16.8H}, [x9], #16 // .............................*....................................................................................
+ uzp1 v4.8H, v19.8H, v3.8H // ..................................*...............................................................................
+ uzp2 v19.8H, v19.8H, v3.8H // .......................................*..........................................................................
+ ldr q3, [x8], #32 // ..............................*...................................................................................
+ ldr q31, [x2], #32 // ......................................*...........................................................................
+ uzp1 v6.8H, v3.8H, v17.8H // ...................................................*..............................................................
+ uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................
+ smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*.......................................................
+ smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*......................................................
+ smlal v13.4S, v9.4H, v17.4H // ............................................................*.....................................................
+ smlal v12.4S, v9.4H, v6.4H // .............................................................*....................................................
+ smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*...................................................
+ smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*..................................................
+ smlal v13.4S, v23.4H, v6.4H // ................................................................*.................................................
+ smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................
+ smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*...............................................
+ smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*..............................................
+ smlal v13.4S, v26.4H, v19.4H // ....................................................................*.............................................
+ smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................
+ smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*...........................................
+ smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*..........................................
+ smlal v13.4S, v10.4H, v4.4H // ........................................................................*.........................................
+ smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................
+ ldr q19, [x2, #-16] // .........................................*........................................................................
+ uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*......................................
+ uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*.............................
+ mul v23.8H, v23.8H, v2.8H // .............................................................................*....................................
+ uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*.................................
+ uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*..............................
+ mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................
+ smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................
+ smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*...............................
+ ldr q23, [x5], #32 // .............................................*....................................................................
+ smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*......
+ uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*...........................
+ smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*.....
+ ldr q17, [x5, #-16] // ................................................*.................................................................
+ ldr q13, [x1, #16] // ......................................................*...........................................................
+ uzp2 v27.8H, v23.8H, v17.8H // ....................................................*.............................................................
+ uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*.....................................
+ uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*..
+ ldr q23, [x1], #32 // ..........................................................................*.......................................
+ zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................*
+ ldr q3, [x7, #16] // ........................................................................................*.........................
+ uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*...................................
+ uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*..................................
+ smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*..........................
+ ldr q6, [x8, #16] // .........................................................................................*........................
+ ldr q23, [x10], #32 // ..........................................................................................*.......................
+ smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*.......
+ ldr q17, [x10, #-16] // ...........................................................................................*......................
+ ld1 {v22.8H}, [x12], #16 // ............................................................................................*.....................
+ uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*....................
+ uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*...................
+ ldr q23, [x4], #32 // ...............................................................................................*..................
+ ldr q17, [x4, #-16] // ................................................................................................*.................
+ ldr q4, [x7], #32 // .................................................................................................*................
+ uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*...............
+ uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*..............
+ uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............
+ smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*...
+ ld1 {v8.8H}, [x6], #16 // ....................................................................................................*.............
+ ldr q25, [x11, #16] // ......................................................................................................*...........
+ ldr q29, [x11], #32 // .......................................................................................................*..........
+ ld1 {v12.8H}, [x9], #16 // ........................................................................................................*.........
+ uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*.
+ ldr q14, [x8], #32 // .........................................................................................................*........
+ ld1 {v23.8H}, [x3], #16 // .............................................................................................................*....
+
+ // ------------------------------------------------- new position -------------------------------------------------->
+ // 0 25 50 75 100
+ // |------------------------|------------------------|------------------------|------------------------|-------------
+ // ldr q3, [x2], #32 // .*................................................................................................................
+ // ldr q17, [x2, #-16] // *.................................................................................................................
+ // ldr q21, [x5], #32 // ..*...............................................................................................................
+ // ldr q19, [x5, #-16] // .....*............................................................................................................
+ // uzp2 v27.8H, v21.8H, v19.8H // .......*..........................................................................................................
+ // ldr q25, [x1, #16] // ......*...........................................................................................................
+ // ldr q22, [x1], #32 // .........*........................................................................................................
+ // uzp1 v28.8H, v21.8H, v19.8H // ........*.........................................................................................................
+ // uzp1 v31.8H, v22.8H, v25.8H // ...........*......................................................................................................
+ // uzp2 v16.8H, v22.8H, v25.8H // ............*.....................................................................................................
+ // uzp2 v21.8H, v3.8H, v17.8H // ...*..............................................................................................................
+ // uzp1 v19.8H, v3.8H, v17.8H // ....*.............................................................................................................
+ // smull2 v24.4S, v31.8H, v21.8H // .............*....................................................................................................
+ // ldr q3, [x7, #16] // ..........*.......................................................................................................
+ // ldr q6, [x8, #16] // .................................*................................................................................
+ // ldr q8, [x10], #32 // ....................................*.............................................................................
+ // ldr q26, [x10, #-16] // .....................................*............................................................................
+ // ld1 {v22.8H}, [x12], #16 // ......................................*...........................................................................
+ // uzp1 v30.8H, v8.8H, v26.8H // .......................................*..........................................................................
+ // uzp2 v11.8H, v8.8H, v26.8H // ........................................*.........................................................................
+ // ldr q8, [x4], #32 // ...................*..............................................................................................
+ // ldr q26, [x4, #-16] // ....................*.............................................................................................
+ // ldr q4, [x7], #32 // ................................*.................................................................................
+ // uzp1 v20.8H, v8.8H, v26.8H // ......................*...........................................................................................
+ // uzp2 v26.8H, v8.8H, v26.8H // .......................*..........................................................................................
+ // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................
+ // uzp1 v9.8H, v4.8H, v3.8H // ..................................*...............................................................................
+ // ldr q25, [x11, #16] // ..........................................*.......................................................................
+ // ldr q29, [x11], #32 // .............................................*....................................................................
+ // ld1 {v12.8H}, [x9], #16 // ..............................................*...................................................................
+ // ldr q14, [x8], #32 // .................................................*................................................................
+ // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................
+ // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................
+ // smlal2 v24.4S, v20.8H, v27.8H // ........................*.........................................................................................
+ // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*..................................................................
+ // smlal2 v24.4S, v26.8H, v28.8H // ............................*.....................................................................................
+ // uzp2 v4.8H, v4.8H, v3.8H // ...................................*..............................................................................
+ // smull2 v13.4S, v31.8H, v19.8H // ...............*..................................................................................................
+ // ldr q3, [x2], #32 // ..................................................*...............................................................
+ // uzp2 v1.8H, v29.8H, v25.8H // ................................................*.................................................................
+ // smlal2 v13.4S, v16.8H, v23.8H // ..........................*.......................................................................................
+ // ldr q17, [x2, #-16] // .....................................................................*............................................
+ // smull v18.4S, v31.4H, v19.4H // ................*.................................................................................................
+ // smlal2 v13.4S, v20.8H, v28.8H // ..............................*...................................................................................
+ // smull v29.4S, v31.4H, v21.4H // ..............*...................................................................................................
+ // ldr q21, [x5], #32 // ..............................................................................*...................................
+ // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*......................................................................
+ // smlal v29.4S, v16.4H, v19.4H // ..................*...............................................................................................
+ // ldr q19, [x5, #-16] // ..................................................................................*...............................
+ // smlal v18.4S, v16.4H, v23.4H // ...........................*......................................................................................
+ // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................
+ // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*..............................................................
+ // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*.............................
+ // smlal v18.4S, v20.4H, v28.4H // ...............................*..................................................................................
+ // ldr q25, [x1, #16] // ...................................................................................*..............................
+ // smlal v29.4S, v26.4H, v28.4H // .............................*....................................................................................
+ // smlal v18.4S, v26.4H, v8.4H // ............................................*.....................................................................
+ // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*.............................................................
+ // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................
+ // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*...........................................................
+ // smlal v29.4S, v9.4H, v26.4H // .......................................................*..........................................................
+ // smlal v18.4S, v9.4H, v31.4H // ........................................................*.........................................................
+ // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................
+ // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*.......................................................
+ // smlal v29.4S, v4.4H, v31.4H // ...........................................................*......................................................
+ // smlal v18.4S, v4.4H, v12.4H // ............................................................*.....................................................
+ // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................................................
+ // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*...................................................
+ // smlal v29.4S, v30.4H, v1.4H // ...............................................................*..................................................
+ // smlal v18.4S, v30.4H, v10.4H // ................................................................*.................................................
+ // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................
+ // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*...............................................
+ // smlal v29.4S, v11.4H, v10.4H // ...................................................................*..............................................
+ // smlal v18.4S, v11.4H, v22.4H // ....................................................................*.............................................
+ // ldr q22, [x1], #32 // .......................................................................................*..........................
+ // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*...........................................
+ // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................
+ // mul v19.8H, v31.8H, v2.8H // ........................................................................*.........................................
+ // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*.......................
+ // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*......................
+ // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................
+ // smlal v29.4S, v19.4H, v0.4H // ............................................................................*.....................................
+ // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*....................................
+ // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*.......................................
+ // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*..........................................
+ // mul v23.8H, v26.8H, v2.8H // ...........................................................................*......................................
+ // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*.................................
+ // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*.....................
+ // ldr q3, [x7, #16] // .........................................................................................*........................
+ // ldr q6, [x8, #16] // .............................................................................................*....................
+ // ldr q8, [x10], #32 // ..............................................................................................*...................
+ // ldr q26, [x10, #-16] // ................................................................................................*.................
+ // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................
+ // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*...............
+ // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*..............
+ // ldr q8, [x4], #32 // ....................................................................................................*.............
+ // ldr q26, [x4, #-16] // .....................................................................................................*............
+ // ldr q4, [x7], #32 // ......................................................................................................*...........
+ // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*..........
+ // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*.........
+ // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*......
+ // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........
+ // ldr q25, [x11, #16] // ............................................................................................................*.....
+ // ldr q29, [x11], #32 // .............................................................................................................*....
+ // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*...
+ // ldr q14, [x8], #32 // ................................................................................................................*.
+ // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*..................
+ // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*..................................
+ // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................
+ // ld1 {v23.8H}, [x3], #16 // .................................................................................................................*
+ // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*.......
+ // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*...........................
+ // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*..
+ // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*.........................
+
+ sub count, count, #2 // NOTE(review): lowers the loop counter by 2 before entering the loop; presumably the iterations peeled into the surrounding preamble/postamble -- confirm
+polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop: // Main loop body (82 instructions per the statistics below); the per-line diagrams give each instruction's position in the original, unscheduled code
+ // Instructions: 82
+ // Expected cycles: 102
+ // Expected IPC: 0.80
+ //
+ // Cycle bound: 102.0
+ // IPC bound: 0.80
+ //
+ // Wall time: 15.93s
+ // User time: 15.93s
+ //
+ // ------------------------------- original position ------------------------------->
+ // 0 25 50 75
+ // |------------------------|------------------------|------------------------|------
+ smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................
+ uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................
+ smull2 v13.4S, v31.8H, v19.8H // ..........*.......................................................................
+ ldr q3, [x2], #32 // ....e.............................................................................
+ uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*.......................
+ smlal2 v13.4S, v16.8H, v23.8H // ............*.....................................................................
+ ldr q17, [x2, #-16] // .....e............................................................................
+ smull v18.4S, v31.4H, v19.4H // .........*........................................................................
+ smlal2 v13.4S, v20.8H, v28.8H // ...........................*......................................................
+ smull v29.4S, v31.4H, v21.4H // .............*....................................................................
+ ldr q21, [x5], #32 // .....................e............................................................
+ smlal2 v13.4S, v26.8H, v8.8H // .............................*....................................................
+ smlal v29.4S, v16.4H, v19.4H // ...............*..................................................................
+ ldr q19, [x5, #-16] // ......................e...........................................................
+ smlal v18.4S, v16.4H, v23.4H // ...........*......................................................................
+ smlal v29.4S, v20.4H, v27.4H // ..............................*...................................................
+ uzp1 v31.8H, v14.8H, v6.8H // ........................................*.........................................
+ uzp2 v27.8H, v21.8H, v19.8H // ........................e.........................................................
+ smlal v18.4S, v20.4H, v28.4H // ..........................*.......................................................
+ ldr q25, [x1, #16] // .e................................................................................
+ smlal v29.4S, v26.4H, v28.4H // ................................*.................................................
+ smlal v18.4S, v26.4H, v8.4H // ............................*.....................................................
+ uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................
+ smlal2 v13.4S, v9.8H, v31.8H // ............................................*.....................................
+ smlal2 v24.4S, v9.8H, v26.8H // ................................................*.................................
+ smlal v29.4S, v9.4H, v26.4H // ...............................................*..................................
+ smlal v18.4S, v9.4H, v31.4H // ...........................................*......................................
+ smlal2 v13.4S, v4.8H, v12.8H // ..............................................*...................................
+ smlal2 v24.4S, v4.8H, v31.8H // ..................................................*...............................
+ smlal v29.4S, v4.4H, v31.4H // .................................................*................................
+ smlal v18.4S, v4.4H, v12.4H // .............................................*....................................
+ smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................
+ smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................
+ smlal v29.4S, v30.4H, v1.4H // ................................................................*.................
+ smlal v18.4S, v30.4H, v10.4H // ............................................................*.....................
+ smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*..................
+ smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*..............
+ smlal v29.4S, v11.4H, v10.4H // ..................................................................*...............
+ smlal v18.4S, v11.4H, v22.4H // ..............................................................*...................
+ ldr q22, [x1], #32 // e.................................................................................
+ uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........
+ uzp1 v28.8H, v21.8H, v19.8H // .......................e..........................................................
+ mul v19.8H, v31.8H, v2.8H // ..........................................................................*.......
+ uzp1 v31.8H, v22.8H, v25.8H // ..e...............................................................................
+ uzp2 v16.8H, v22.8H, v25.8H // ...e..............................................................................
+ uzp2 v21.8H, v3.8H, v17.8H // .......e..........................................................................
+ smlal v29.4S, v19.4H, v0.4H // ...........................................................................*......
+ smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*.....
+ uzp1 v19.8H, v3.8H, v17.8H // ......e...........................................................................
+ uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*.............
+ zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l..
+ mul v23.8H, v26.8H, v2.8H // .....................................................................*............
+ uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*....
+ smull2 v24.4S, v31.8H, v21.8H // ..............e...................................................................
+ str q14, [x0, #16] // .................................................................................l
+ ldr q3, [x7, #16] // ...................................e..............................................
+ ldr q6, [x8, #16] // .......................................e..........................................
+ ldr q8, [x10], #32 // ...................................................e..............................
+ ldr q26, [x10, #-16] // ....................................................e.............................
+ ld1 {v22.8H}, [x12], #16 // ...........................................................e......................
+ uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................
+ uzp2 v11.8H, v8.8H, v26.8H // ......................................................e...........................
+ ldr q8, [x4], #32 // .................e................................................................
+ ldr q26, [x4, #-16] // ..................e...............................................................
+ ldr q4, [x7], #32 // ..................................e...............................................
+ uzp1 v20.8H, v8.8H, v26.8H // ...................e..............................................................
+ uzp2 v26.8H, v8.8H, v26.8H // ....................e.............................................................
+ ld1 {v8.8H}, [x6], #16 // .........................e........................................................
+ uzp1 v9.8H, v4.8H, v3.8H // ....................................e.............................................
+ ldr q25, [x11, #16] // ........................................................e.........................
+ ldr q29, [x11], #32 // .......................................................e..........................
+ ld1 {v12.8H}, [x9], #16 // ..........................................e.......................................
+ ldr q14, [x8], #32 // ......................................e...........................................
+ smlal2 v24.4S, v16.8H, v19.8H // ................e.................................................................
+ smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*..........
+ smlal v18.4S, v23.4H, v0.4H // ......................................................................*...........
+ ld1 {v23.8H}, [x3], #16 // ........e.........................................................................
+ smlal2 v24.4S, v20.8H, v27.8H // ...............................e..................................................
+ uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*.........
+ uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................
+ str q5, [x0], #32 // ................................................................................l.
+ zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*...
+
+ // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------>
+ // 0 25 50 75 100 125 150 175 200 225
+ // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------
+ // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~.........................................
+ // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~.............................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~.....................................
+ // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~....................................
+ // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~.............................................................................
+ // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~..........................................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~...................................
+ // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~....
+ // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~.........................................................................
+ // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~..............................................................................
+ // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~..................................................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~...........................................................................
+ // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~.......................................................................
+ // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~...........................
+ // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~....................................................................
+ // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~.......
+ // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~..................
+ // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~.................
+ // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~...............
+ // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~..............
+ // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~......................................................................
+ // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~...................................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~.......................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~...............................................................
+ // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~.............
+ // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~..............................................................
+ // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................
+ // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~...........................................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~.....................................................................
+ // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~.................................................................
+ // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~...
+ // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................
+ // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................
+ // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................
+ // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~.........................
+ // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............
+ // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~...............................................................................
+ // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........
+ // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................
+ // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~..........................................................
+ // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~.........
+ // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~......................................................
+ // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~.........................................................
+ // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~..................................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~.....................................................
+ // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~.......................................................
+ // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................
+ // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~...................................................
+ // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~....................................................
+ // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~.......................
+ // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~......................
+ // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~....................
+ // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~...................
+ // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~..........
+ // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~...........
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~.
+ // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................
+ // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~.....................
+ // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~..............................................
+ // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~.................................................
+ // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~..........................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~.............................................
+ // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~...............................................
+ // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................
+ // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~...........................................
+ // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................
+ // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~...............................
+ // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~.............................
+ // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~.....
+ // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~......
+ // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~..
+ // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................
+ // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~......................................
+ // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~..................................
+ // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~.................................
+ // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................
+ // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................
+ // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l..............................
+ // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l
+ // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l..........................
+
+ sub count, count, #1
+ cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop
+
+ // Instructions: 50
+ // Expected cycles: 56
+ // Expected IPC: 0.89
+ //
+ // Cycle bound: 56.0
+ // IPC bound: 0.89
+ //
+ // Wall time: 4.16s
+ // User time: 4.16s
+ //
+ // --------------- original position --------------->
+ // 0 25
+ // |------------------------|
+ smull2 v17.4S, v31.8H, v19.8H // ..*...............................................
+ uzp2 v1.8H, v14.8H, v6.8H // ................*.................................
+ smull v18.4S, v31.4H, v21.4H // .......*..........................................
+ smlal2 v24.4S, v26.8H, v28.8H // *.................................................
+ smlal2 v17.4S, v16.8H, v23.8H // ....*.............................................
+ smull v21.4S, v31.4H, v19.4H // .....*............................................
+ smlal v18.4S, v16.4H, v19.4H // .........*........................................
+ uzp2 v31.8H, v4.8H, v3.8H // .*................................................
+ uzp1 v3.8H, v14.8H, v6.8H // ............*.....................................
+ smlal v21.4S, v16.4H, v23.4H // ..........*.......................................
+ smlal v18.4S, v20.4H, v27.4H // ...........*......................................
+ uzp2 v14.8H, v29.8H, v25.8H // ...*..............................................
+ smlal2 v17.4S, v20.8H, v28.8H // ......*...........................................
+ smlal v21.4S, v20.4H, v28.4H // .............*....................................
+ smlal v18.4S, v26.4H, v28.4H // ..............*...................................
+ smlal2 v24.4S, v9.8H, v1.8H // ..................*...............................
+ smlal2 v17.4S, v26.8H, v8.8H // ........*.........................................
+ smlal v21.4S, v26.4H, v8.4H // ...............*..................................
+ smlal v18.4S, v9.4H, v1.4H // ...................*..............................
+ smlal2 v24.4S, v31.8H, v3.8H // ......................*...........................
+ smlal2 v17.4S, v9.8H, v3.8H // .................*................................
+ smlal v21.4S, v9.4H, v3.4H // ....................*.............................
+ smlal v18.4S, v31.4H, v3.4H // .......................*..........................
+ smlal2 v24.4S, v30.8H, v14.8H // ..........................*.......................
+ smlal2 v17.4S, v31.8H, v12.8H // .....................*............................
+ smlal v21.4S, v31.4H, v12.4H // ........................*.........................
+ smlal v18.4S, v30.4H, v14.4H // ...........................*......................
+ smlal2 v24.4S, v11.8H, v10.8H // ..............................*...................
+ smlal2 v17.4S, v30.8H, v10.8H // .........................*........................
+ smlal v21.4S, v30.4H, v10.4H // ............................*.....................
+ smlal v18.4S, v11.4H, v10.4H // ...............................*..................
+ zip2 v19.8H, v7.8H, v15.8H // ......................................*...........
+ smlal2 v17.4S, v11.8H, v22.8H // .............................*....................
+ smlal v21.4S, v11.4H, v22.4H // ................................*.................
+ uzp1 v23.8H, v18.8H, v24.8H // .................................*................
+ str q19, [x0, #16] // .........................................*........
+ mul v19.8H, v23.8H, v2.8H // ..................................*...............
+ uzp1 v23.8H, v21.8H, v17.8H // .....................................*............
+ str q5, [x0], #32 // .............................................*....
+ mul v26.8H, v23.8H, v2.8H // .......................................*..........
+ smlal v18.4S, v19.4H, v0.4H // ...................................*..............
+ smlal2 v24.4S, v19.8H, v0.8H // ....................................*.............
+ smlal v21.4S, v26.4H, v0.4H // ...........................................*......
+ smlal2 v17.4S, v26.8H, v0.8H // ..........................................*.......
+ uzp2 v13.8H, v18.8H, v24.8H // ........................................*.........
+ uzp2 v19.8H, v21.8H, v17.8H // ............................................*.....
+ zip1 v23.8H, v19.8H, v13.8H // ..............................................*...
+ zip2 v19.8H, v19.8H, v13.8H // ...............................................*..
+ str q23, [x0], #32 // .................................................*
+ str q19, [x0, #-16] // ................................................*.
+
+ // ----------------- new position ------------------>
+ // 0 25
+ // |------------------------|------------------------
+ // smlal2 v24.4S, v26.8H, v28.8H // ...*..............................................
+ // uzp2 v4.8H, v4.8H, v3.8H // .......*..........................................
+ // smull2 v13.4S, v31.8H, v19.8H // *.................................................
+ // uzp2 v1.8H, v29.8H, v25.8H // ...........*......................................
+ // smlal2 v13.4S, v16.8H, v23.8H // ....*.............................................
+ // smull v18.4S, v31.4H, v19.4H // .....*............................................
+ // smlal2 v13.4S, v20.8H, v28.8H // ............*.....................................
+ // smull v29.4S, v31.4H, v21.4H // ..*...............................................
+ // smlal2 v13.4S, v26.8H, v8.8H // ................*.................................
+ // smlal v29.4S, v16.4H, v19.4H // ......*...........................................
+ // smlal v18.4S, v16.4H, v23.4H // .........*........................................
+ // smlal v29.4S, v20.4H, v27.4H // ..........*.......................................
+ // uzp1 v31.8H, v14.8H, v6.8H // ........*.........................................
+ // smlal v18.4S, v20.4H, v28.4H // .............*....................................
+ // smlal v29.4S, v26.4H, v28.4H // ..............*...................................
+ // smlal v18.4S, v26.4H, v8.4H // .................*................................
+ // uzp2 v26.8H, v14.8H, v6.8H // .*................................................
+ // smlal2 v13.4S, v9.8H, v31.8H // ....................*.............................
+ // smlal2 v24.4S, v9.8H, v26.8H // ...............*..................................
+ // smlal v29.4S, v9.4H, v26.4H // ..................*...............................
+ // smlal v18.4S, v9.4H, v31.4H // .....................*............................
+ // smlal2 v13.4S, v4.8H, v12.8H // ........................*.........................
+ // smlal2 v24.4S, v4.8H, v31.8H // ...................*..............................
+ // smlal v29.4S, v4.4H, v31.4H // ......................*...........................
+ // smlal v18.4S, v4.4H, v12.4H // .........................*........................
+ // smlal2 v13.4S, v30.8H, v10.8H // ............................*.....................
+ // smlal2 v24.4S, v30.8H, v1.8H // .......................*..........................
+ // smlal v29.4S, v30.4H, v1.4H // ..........................*.......................
+ // smlal v18.4S, v30.4H, v10.4H // .............................*....................
+ // smlal2 v13.4S, v11.8H, v22.8H // ................................*.................
+ // smlal2 v24.4S, v11.8H, v10.8H // ...........................*......................
+ // smlal v29.4S, v11.4H, v10.4H // ..............................*...................
+ // smlal v18.4S, v11.4H, v22.4H // .................................*................
+ // uzp1 v31.8H, v29.8H, v24.8H // ..................................*...............
+ // mul v19.8H, v31.8H, v2.8H // ....................................*.............
+ // smlal v29.4S, v19.4H, v0.4H // ........................................*.........
+ // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........
+ // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............
+ // zip2 v14.8H, v7.8H, v15.8H // ...............................*..................
+ // mul v23.8H, v26.8H, v2.8H // .......................................*..........
+ // uzp2 v15.8H, v29.8H, v24.8H // ............................................*.....
+ // str q14, [x0, #16] // ...................................*..............
+ // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*......
+ // smlal v18.4S, v23.4H, v0.4H // ..........................................*.......
+ // uzp2 v7.8H, v18.8H, v13.8H // .............................................*....
+ // str q5, [x0], #32 // ......................................*...........
+ // zip1 v5.8H, v7.8H, v15.8H // ..............................................*...
+ // zip2 v14.8H, v7.8H, v15.8H // ...............................................*..
+ // str q14, [x0, #16] // .................................................*
+ // str q5, [x0], #32 // ................................................*.
+
+
+ pop_stack
+ ret
+
+/****************** REGISTER DEALLOCATIONS *******************/
+ .unreq out
+ .unreq a0_ptr
+ .unreq b0_ptr
+ .unreq b0_cache_ptr
+ .unreq a1_ptr
+ .unreq b1_ptr
+ .unreq b1_cache_ptr
+ .unreq a2_ptr
+ .unreq b2_ptr
+ .unreq b2_cache_ptr
+ .unreq a3_ptr
+ .unreq b3_ptr
+ .unreq b3_cache_ptr
+ .unreq count
+ .unreq modulus
+ .unreq modulus_twisted
+ .unreq wtmp
+ .unreq aa0
+ .unreq aa1
+ .unreq bb0
+ .unreq bb1
+ .unreq bb1t
+ .unreq res0l
+ .unreq res1l
+ .unreq res0h
+ .unreq res1h
+ .unreq tmp0
+ .unreq tmp1
+ .unreq q_tmp0
+ .unreq q_tmp1
+ .unreq out0
+ .unreq out1
+ .unreq t0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S
deleted file mode 100644
index 94f0889b7..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-//
-// AArch64 re-implementation of the asymmetric base multiplication from:
-//
-// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
-// https://eprint.iacr.org/2021/986
-// https://github.com/neon-ntt/neon-ntt
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
-
-// Input:
-// - Vectors al, ah of 32-bit entries
-// Output:
-// - Montgomery reductions of al || ah, stored in al
-.macro montgomery_reduce_long x, a
- uzp1 t0.8h, \a\()l.8h, \a\()h.8h
- mul t0.8h, t0.8h, modulus_twisted.8h
- smlal \a\()l.4s, t0.4h, modulus.4h
- smlal2 \a\()h.4s, t0.8h, modulus.8h
- uzp2 \x\().8h, \a\()l.8h, \a\()h.8h
-.endm
-
-// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit.
-//
-// Bounds:
-// - Assume |a| < 4096,
-// - Result: < 2*4096*2^15 = 2^28
-.macro pmull d, a, b
- smull \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smull \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro pmlal d, a, b
- smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro ld2_wrap a, ptr
- ldr q_tmp0, [\ptr\()], #32
- ldr q_tmp1, [\ptr\(), #-16]
- uzp1 \a\()0.8h, tmp0.8h, tmp1.8h
- uzp2 \a\()1.8h, tmp0.8h, tmp1.8h
-.endm
-
-.macro st2_wrap a, ptr
- zip1 tmp0.8h, \a\()0.8h, \a\()1.8h
- zip2 tmp1.8h, \a\()0.8h, \a\()1.8h
- str q_tmp0, [\ptr\()], #32
- str q_tmp1, [\ptr\(), #-16]
-.endm
-
-.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
- ld2_wrap \a\(), \a_ptr
- ld2_wrap \b\(), \b_ptr
- ld1 {\b\()1t.8h}, [\b_cache_ptr], #16
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
- out .req x0
- a0_ptr .req x1
- b0_ptr .req x2
- b0_cache_ptr .req x3
- a1_ptr .req x4
- b1_ptr .req x5
- b1_cache_ptr .req x6
- a2_ptr .req x7
- b2_ptr .req x8
- b2_cache_ptr .req x9
- a3_ptr .req x10
- b3_ptr .req x11
- b3_cache_ptr .req x12
- count .req x13
- wtmp .req w14
-
- modulus .req v0
- modulus_twisted .req v2
-
- aa0 .req v3
- aa1 .req v4
- bb0 .req v5
- bb1 .req v6
- bb1t .req v7
-
- res0l .req v8
- res1l .req v9
- res0h .req v10
- res1h .req v11
-
- tmp0 .req v12
- tmp1 .req v13
- q_tmp0 .req q12
- q_tmp1 .req q13
-
- out0 .req v26
- out1 .req v27
-
- t0 .req v28
-
-#if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
- push_stack
-
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
-
- mov count, #(MLKEM_N / 16)
-k2_loop_start:
-
- load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr
- pmull res, aa, bb
- load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr
- pmlal res, aa, bb
-
- montgomery_reduce_long out0, res0
- montgomery_reduce_long out1, res1
-
- st2_wrap out, out
-
- subs count, count, #1
- cbnz count, k2_loop_start
-
- pop_stack
- ret
-#endif /* MLKEM_K == 2 */
-
-#if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
-
- mov count, #(MLKEM_N / 16)
-k3_loop_start:
-
- load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr
- pmull res, aa, bb
- load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr
- pmlal res, aa, bb
- load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr
- pmlal res, aa, bb
-
- montgomery_reduce_long out0, res0
- montgomery_reduce_long out1, res1
-
- st2_wrap out, out
-
- subs count, count, #1
- cbnz count, k3_loop_start
-
- pop_stack
- ret
-#endif /* MLKEM_K == 3 */
-
-#if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
- add a3_ptr, a0_ptr, #(3 * 512)
- add b3_ptr, b0_ptr, #(3 * 512)
- add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2)
-
- // Bounds:
- //
- // Each pmull is bound by 2*4096*2^15=2^28, so the final value
- // before Montgomery reduction is bound by 2^30.
-
- mov count, #(MLKEM_N / 16)
-k4_loop_start:
-
- load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr
- pmull res, aa, bb
- load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr
- pmlal res, aa, bb
- load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr
- pmlal res, aa, bb
- load_polys aa, bb, a3_ptr, b3_ptr, b3_cache_ptr
- pmlal res, aa, bb
-
- montgomery_reduce_long out0, res0
- montgomery_reduce_long out1, res1
-
- st2_wrap out, out
-
- subs count, count, #1
- cbnz count, k4_loop_start
-
- pop_stack
- ret
-#endif /* MLKEM_K == 4 */
-
-/****************** REGISTER DEALLOCATIONS *******************/
- .unreq out
- .unreq a0_ptr
- .unreq b0_ptr
- .unreq b0_cache_ptr
- .unreq a1_ptr
- .unreq b1_ptr
- .unreq b1_cache_ptr
- .unreq a2_ptr
- .unreq b2_ptr
- .unreq b2_cache_ptr
- .unreq a3_ptr
- .unreq b3_ptr
- .unreq b3_cache_ptr
- .unreq count
- .unreq modulus
- .unreq modulus_twisted
- .unreq aa0
- .unreq aa1
- .unreq bb0
- .unreq bb1
- .unreq bb1t
- .unreq res0l
- .unreq res1l
- .unreq res0h
- .unreq wtmp
- .unreq res1h
- .unreq tmp0
- .unreq tmp1
- .unreq q_tmp0
- .unreq q_tmp1
- .unreq out0
- .unreq out1
- .unreq t0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S
deleted file mode 100644
index 275ca06d2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S
+++ /dev/null
@@ -1,1606 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-// AArch64 re-implementation of the asymmetric base multiplication from:
-
-// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
-// https://eprint.iacr.org/2021/986
-// https://github.com/neon-ntt/neon-ntt
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
-
-// Input:
-// - Vectors al, ah of 32-bit entries
-// Output:
-// - Montgomery reductions of al || ah, stored in al
-.macro montgomery_reduce_long x, a
- uzp1 t0.8h, \a\()l.8h, \a\()h.8h
- mul t0.8h, t0.8h, modulus_twisted.8h
- smlal \a\()l.4s, t0.4h, modulus.4h
- smlal2 \a\()h.4s, t0.8h, modulus.8h
- uzp2 \x\().8h, \a\()l.8h, \a\()h.8h
-.endm
-
-// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit.
-
-// Bounds:
-// - Assume |a| < 4096,
-// - Result: < 2*4096*2^15 = 2^28
-.macro pmull d, a, b
- smull \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smull \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro pmlal d, a, b
- smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro ld2_wrap a, ptr
- ldr q_tmp0, [\ptr\()], #32
- ldr q_tmp1, [\ptr\(), #-16]
- uzp1 \a\()0.8h, tmp0.8h, tmp1.8h
- uzp2 \a\()1.8h, tmp0.8h, tmp1.8h
-.endm
-
-.macro st2_wrap a, ptr
- zip1 tmp0.8h, \a\()0.8h, \a\()1.8h
- zip2 tmp1.8h, \a\()0.8h, \a\()1.8h
- str q_tmp0, [\ptr\()], #32
- str q_tmp1, [\ptr\(), #-16]
-.endm
-
-.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
- ld2_wrap \a\(), \a_ptr
- ld2_wrap \b\(), \b_ptr
- ld1 {\b\()1t.8h}, [\b_cache_ptr], #16
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
- out .req x0
- a0_ptr .req x1
- b0_ptr .req x2
- b0_cache_ptr .req x3
- a1_ptr .req x4
- b1_ptr .req x5
- b1_cache_ptr .req x6
- a2_ptr .req x7
- b2_ptr .req x8
- b2_cache_ptr .req x9
- a3_ptr .req x10
- b3_ptr .req x11
- b3_cache_ptr .req x12
- count .req x13
- wtmp .req w14
-
- modulus .req v0
- modulus_twisted .req v2
-
- aa0 .req v3
- aa1 .req v4
- bb0 .req v5
- bb1 .req v6
- bb1t .req v7
-
- res0l .req v8
- res1l .req v9
- res0h .req v10
- res1h .req v11
-
- tmp0 .req v12
- tmp1 .req v13
- q_tmp0 .req q12
- q_tmp1 .req q13
-
- out0 .req v26
- out1 .req v27
-
- t0 .req v28
-
-#if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
- push_stack
-
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
-
- mov count, #(MLKEM_N / 16)
- // Instructions: 75
- // Expected cycles: 94
- // Expected IPC: 0.80
-
- // Cycle bound: 94.0
- // IPC bound: 0.80
-
- // Wall time: 1.49s
- // User time: 1.49s
-
- // --------------------------- original position ---------------------------->
- // 0 25 50
- // |------------------------|------------------------|
- ldr q9, [x4], #32 // *..........................................................................
- ldr q5, [x4, #-16] // ......*....................................................................
- ldr q11, [x5], #32 // .*.........................................................................
- uzp1 v23.8H, v9.8H, v5.8H // .........*.................................................................
- uzp2 v9.8H, v9.8H, v5.8H // .....................*.....................................................
- ldr q5, [x2], #32 // ..*........................................................................
- ldr q7, [x5, #-16] // ..............*............................................................
- ldr q21, [x2, #-16] // ...*.......................................................................
- uzp2 v10.8H, v11.8H, v7.8H // .................*.........................................................
- uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................
- uzp1 v7.8H, v5.8H, v21.8H // ....*......................................................................
- uzp2 v5.8H, v5.8H, v21.8H // .....*.....................................................................
- ldr q21, [x1], #32 // .......*...................................................................
- ldr q25, [x1, #-16] // ........*..................................................................
- ld1 {v6.8H}, [x3], #16 // ............................*..............................................
- uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................
- uzp2 v21.8H, v21.8H, v25.8H // ...........*...............................................................
- smull v25.4S, v26.4H, v5.4H // ............*..............................................................
- smull2 v5.4S, v26.8H, v5.8H // .............*.............................................................
- smull v19.4S, v26.4H, v7.4H // ..........................*................................................
- smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................
- smlal v25.4S, v21.4H, v7.4H // ...............*...........................................................
- smlal2 v5.4S, v21.8H, v7.8H // ................*..........................................................
- smlal v19.4S, v21.4H, v6.4H // ...................................*.......................................
- smlal2 v26.4S, v21.8H, v6.8H // .................................*.........................................
- smlal v25.4S, v23.4H, v10.4H // ...................*.......................................................
- smlal2 v5.4S, v23.8H, v10.8H // ....................*......................................................
- smlal v19.4S, v23.4H, v11.4H // ......................................*....................................
- smlal2 v26.4S, v23.8H, v11.8H // ....................................*......................................
- ld1 {v23.8H}, [x6], #16 // ........................*..................................................
- smlal v25.4S, v9.4H, v11.4H // ......................*....................................................
- smlal2 v5.4S, v9.8H, v11.8H // .......................*...................................................
- smlal2 v26.4S, v9.8H, v23.8H // .......................................*...................................
- smlal v19.4S, v9.4H, v23.4H // .........................................*.................................
- ldr q9, [x4], #32 // ...............................*...........................................
- uzp1 v11.8H, v25.8H, v5.8H // .........................*.................................................
- uzp1 v23.8H, v19.8H, v26.8H // .............................................*.............................
- mul v11.8H, v11.8H, v2.8H // ...........................*...............................................
- mul v23.8H, v23.8H, v2.8H // ..............................................*............................
- ldr q7, [x5], #32 // ................................*..........................................
- smlal2 v5.4S, v11.8H, v0.8H // .............................*.............................................
- smlal v25.4S, v11.4H, v0.4H // ..................................*........................................
- ldr q11, [x2], #32 // .....................................*.....................................
- ldr q21, [x2, #-16] // ........................................*..................................
- ldr q6, [x4, #-16] // ...............................................*...........................
- uzp1 v17.8H, v11.8H, v21.8H // ...........................................*...............................
- ldr q10, [x1], #32 // ................................................*..........................
- ldr q29, [x1, #-16] // .................................................*.........................
- uzp2 v11.8H, v11.8H, v21.8H // ............................................*..............................
- uzp1 v13.8H, v9.8H, v6.8H // ...................................................*.......................
- uzp1 v3.8H, v10.8H, v29.8H // ....................................................*......................
- uzp2 v10.8H, v10.8H, v29.8H // .....................................................*.....................
- smull v12.4S, v3.4H, v11.4H // ......................................................*....................
- smull2 v11.4S, v3.8H, v11.8H // .......................................................*...................
- ldr q21, [x5, #-16] // ........................................................*..................
- smlal v12.4S, v10.4H, v17.4H // .........................................................*.................
- smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................
- uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*...............
- uzp1 v15.8H, v7.8H, v21.8H // ............................................................*..............
- smlal v12.4S, v13.4H, v29.4H // .............................................................*.............
- smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............
- uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*...........
- smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................
- smlal v12.4S, v28.4H, v15.4H // .................................................................*.........
- smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........
- smlal v19.4S, v23.4H, v0.4H // ................................................................*..........
- uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................
- smull v23.4S, v3.4H, v17.4H // ......................................................................*....
- uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*.....
- uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*......
- mul v14.8H, v9.8H, v2.8H // .......................................................................*...
- ld1 {v22.8H}, [x6], #16 // ...................................................................*.......
- zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
- smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................*
- ld1 {v4.8H}, [x3], #16 // .........................................................................*.
-
- // ------------------------------ new position ------------------------------>
- // 0 25 50
- // |------------------------|------------------------|------------------------
- // ldr q18, [x4], #32 // *..........................................................................
- // ldr q30, [x5], #32 // ..*........................................................................
- // ldr q8, [x2], #32 // .....*.....................................................................
- // ldr q9, [x2, #-16] // .......*...................................................................
- // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................
- // uzp2 v4.8H, v8.8H, v9.8H // ...........*...............................................................
- // ldr q19, [x4, #-16] // .*.........................................................................
- // ldr q29, [x1], #32 // ............*..............................................................
- // ldr q12, [x1, #-16] // .............*.............................................................
- // uzp1 v13.8H, v18.8H, v19.8H // ...*.......................................................................
- // uzp1 v3.8H, v29.8H, v12.8H // ...............*...........................................................
- // uzp2 v10.8H, v29.8H, v12.8H // ................*..........................................................
- // smull v12.4S, v3.4H, v4.4H // .................*.........................................................
- // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................
- // ldr q5, [x5, #-16] // ......*....................................................................
- // smlal v12.4S, v10.4H, v17.4H // .....................*.....................................................
- // smlal2 v11.4S, v10.8H, v17.8H // ......................*....................................................
- // uzp2 v14.8H, v30.8H, v5.8H // ........*..................................................................
- // uzp1 v15.8H, v30.8H, v5.8H // .........*.................................................................
- // smlal v12.4S, v13.4H, v14.4H // .........................*.................................................
- // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................
- // uzp2 v28.8H, v18.8H, v19.8H // ....*......................................................................
- // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................
- // smlal2 v11.4S, v28.8H, v15.8H // ...............................*...........................................
- // ld1 {v22.8H}, [x6], #16 // .............................*.............................................
- // uzp1 v1.8H, v12.8H, v11.8H // ...................................*.......................................
- // smull v23.4S, v3.4H, v17.4H // ...................*.......................................................
- // mul v14.8H, v1.8H, v2.8H // .....................................*.....................................
- // ld1 {v4.8H}, [x3], #16 // ..............*............................................................
- // smlal2 v11.4S, v14.8H, v0.8H // ........................................*..................................
- // smull2 v20.4S, v3.8H, v17.8H // ....................*......................................................
- // ldr q18, [x4], #32 // ..................................*........................................
- // ldr q30, [x5], #32 // .......................................*...................................
- // smlal2 v20.4S, v10.8H, v4.8H // ........................*..................................................
- // smlal v12.4S, v14.4H, v0.4H // .........................................*.................................
- // smlal v23.4S, v10.4H, v4.4H // .......................*...................................................
- // smlal2 v20.4S, v13.8H, v15.8H // ............................*..............................................
- // ldr q8, [x2], #32 // ..........................................*................................
- // smlal v23.4S, v13.4H, v15.4H // ...........................*...............................................
- // smlal2 v20.4S, v28.8H, v22.8H // ................................*..........................................
- // ldr q9, [x2, #-16] // ...........................................*...............................
- // smlal v23.4S, v28.4H, v22.4H // .................................*.........................................
- // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........
- // uzp1 v17.8H, v8.8H, v9.8H // .............................................*.............................
- // uzp2 v4.8H, v8.8H, v9.8H // ................................................*..........................
- // uzp1 v5.8H, v23.8H, v20.8H // ....................................*......................................
- // mul v31.8H, v5.8H, v2.8H // ......................................*....................................
- // ldr q19, [x4, #-16] // ............................................*..............................
- // ldr q29, [x1], #32 // ..............................................*............................
- // ldr q12, [x1, #-16] // ...............................................*...........................
- // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............
- // uzp1 v13.8H, v18.8H, v19.8H // .................................................*.........................
- // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................
- // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*.......................
- // smull v12.4S, v3.4H, v4.4H // ....................................................*......................
- // smull2 v11.4S, v3.8H, v4.8H // .....................................................*.....................
- // ldr q5, [x5, #-16] // ......................................................*....................
- // smlal v12.4S, v10.4H, v17.4H // .......................................................*...................
- // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*..................
- // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*.................
- // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................
- // smlal v12.4S, v13.4H, v14.4H // ...........................................................*...............
- // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*..............
- // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*.............
- // smlal v23.4S, v31.4H, v0.4H // .................................................................*.........
- // smlal v12.4S, v28.4H, v15.4H // ...............................................................*...........
- // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*..........
- // ld1 {v22.8H}, [x6], #16 // .......................................................................*...
- // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*.....
- // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*......
- // smull v23.4S, v3.4H, v17.4H // ...................................................................*.......
- // mul v14.8H, v1.8H, v2.8H // ......................................................................*....
- // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
- // ld1 {v4.8H}, [x3], #16 // ..........................................................................*
- // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*.
-
- sub count, count, #2
-1:
- // Instructions: 48
- // Expected cycles: 58
- // Expected IPC: 0.83
-
- // Cycle bound: 58.0
- // IPC bound: 0.83
-
- // Wall time: 6.39s
- // User time: 6.39s
-
- // -------------- original position -------------->
- // 0 25
- // |------------------------|----------------------
- smull2 v20.4S, v3.8H, v17.8H // ..........*.....................................
- ldr q18, [x4], #32 // .................e..............................
- ldr q30, [x5], #32 // .....................e..........................
- smlal2 v20.4S, v10.8H, v4.8H // ............*...................................
- smlal v12.4S, v14.4H, v0.4H // .........................................*......
- smlal v23.4S, v10.4H, v4.4H // ...........*....................................
- str q9, [x0, #16] // ...............................................l
- smlal2 v20.4S, v13.8H, v15.8H // ...........................*....................
- ldr q8, [x2], #32 // ....e...........................................
- smlal v23.4S, v13.4H, v15.4H // ..........................*.....................
- smlal2 v20.4S, v28.8H, v22.8H // .............................*..................
- zip1 v26.8H, v19.8H, v27.8H // ............................................l...
- ldr q9, [x2, #-16] // .....e..........................................
- smlal v23.4S, v28.4H, v22.4H // ............................*...................
- uzp2 v27.8H, v12.8H, v11.8H // ...........................................*....
- uzp1 v17.8H, v8.8H, v9.8H // ......e.........................................
- uzp2 v4.8H, v8.8H, v9.8H // .......e........................................
- uzp1 v5.8H, v23.8H, v20.8H // ..................................*.............
- str q26, [x0], #32 // ..............................................l.
- mul v31.8H, v5.8H, v2.8H // ...................................*............
- ldr q19, [x4, #-16] // ..................e.............................
- ldr q29, [x1], #32 // e...............................................
- ldr q12, [x1, #-16] // .e..............................................
- smlal2 v20.4S, v31.8H, v0.8H // .....................................*..........
- uzp1 v13.8H, v18.8H, v19.8H // ...................e............................
- uzp1 v3.8H, v29.8H, v12.8H // ..e.............................................
- uzp2 v10.8H, v29.8H, v12.8H // ...e............................................
- smull v12.4S, v3.4H, v4.4H // .............e..................................
- smull2 v11.4S, v3.8H, v4.8H // ..............e.................................
- ldr q5, [x5, #-16] // ......................e.........................
- smlal v12.4S, v10.4H, v17.4H // ...............e................................
- smlal2 v11.4S, v10.8H, v17.8H // ................e...............................
- uzp2 v14.8H, v30.8H, v5.8H // ........................e.......................
- uzp1 v15.8H, v30.8H, v5.8H // .......................e........................
- smlal v12.4S, v13.4H, v14.4H // ..............................e.................
- smlal2 v11.4S, v13.8H, v14.8H // ...............................e................
- uzp2 v28.8H, v18.8H, v19.8H // ....................e...........................
- smlal v23.4S, v31.4H, v0.4H // ....................................*...........
- smlal v12.4S, v28.4H, v15.4H // ................................e...............
- smlal2 v11.4S, v28.8H, v15.8H // .................................e..............
- ld1 {v22.8H}, [x6], #16 // .........................e......................
- uzp2 v19.8H, v23.8H, v20.8H // ......................................*.........
- uzp1 v1.8H, v12.8H, v11.8H // .......................................e........
- smull v23.4S, v3.4H, v17.4H // .........e......................................
- mul v14.8H, v1.8H, v2.8H // ........................................e.......
- zip2 v9.8H, v19.8H, v27.8H // .............................................*..
- ld1 {v4.8H}, [x3], #16 // ........e.......................................
- smlal2 v11.4S, v14.8H, v0.8H // ..........................................e.....
-
- // ------------------------------------------------- new position -------------------------------------------------->
- // 0 25 50 75 100
- // |------------------------|------------------------|------------------------|------------------------|-------------
- // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'..................
- // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'..................
- // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'..................
- // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'..................
- // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~..........
- // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~......
- // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~...
- // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~..
- // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'..................
- // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'..................
- // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~..................
- // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~.............
- // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~...............
- // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'..................
- // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'..................
- // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'..................
- // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'..................
- // ldr q12, [x4], #32 // e..............................................'~..............................................'~.................
- // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'..................
- // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'..................
- // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'..................
- // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................
- // ldr q13, [x5, #-16] // ............................e..................'............................~..................'..................
- // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'..................
- // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'..................
- // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'..................
- // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~.........
- // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~...........
- // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~.....
- // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........
- // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'..................
- // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'..................
- // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'..................
- // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'..................
- // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~.
- // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'..................
- // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'..................
- // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'..................
- // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'..................
- // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'..................
- // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'..................
- // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~..............
- // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'..................
- // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~....
- // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l.......
- // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'..................
- // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l
- // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............
-
- sub count, count, #1
- cbnz count, 1b
- // Instructions: 21
- // Expected cycles: 35
- // Expected IPC: 0.60
-
- // Cycle bound: 35.0
- // IPC bound: 0.60
-
- // Wall time: 0.08s
- // User time: 0.08s
-
- // ----- original position ----->
- // 0 25
- // |------------------------|----
- smull2 v5.4S, v3.8H, v17.8H // *.............................
- smlal v12.4S, v14.4H, v0.4H // ..*...........................
- smlal v23.4S, v10.4H, v4.4H // ...*..........................
- str q9, [x0, #16] // ....*.........................
- smlal2 v5.4S, v10.8H, v4.8H // .*............................
- uzp2 v11.8H, v12.8H, v11.8H // ..........*...................
- zip1 v9.8H, v19.8H, v27.8H // ........*.....................
- smlal v23.4S, v13.4H, v15.4H // ......*.......................
- smlal2 v5.4S, v13.8H, v15.8H // .....*........................
- str q9, [x0], #32 // ............*.................
- smlal v23.4S, v28.4H, v22.4H // .........*....................
- smlal2 v5.4S, v28.8H, v22.8H // .......*......................
- uzp1 v9.8H, v23.8H, v5.8H // ...........*..................
- mul v9.8H, v9.8H, v2.8H // .............*................
- smlal2 v5.4S, v9.8H, v0.8H // ..............*...............
- smlal v23.4S, v9.4H, v0.4H // ...............*..............
- uzp2 v9.8H, v23.8H, v5.8H // ................*.............
- zip2 v5.8H, v9.8H, v11.8H // .................*............
- zip1 v9.8H, v9.8H, v11.8H // ...................*..........
- str q5, [x0, #16] // ..................*...........
- str q9, [x0], #32 // ....................*.........
-
- // -------- new position -------->
- // 0 25
- // |------------------------|-----
- // smull2 v20.4S, v3.8H, v17.8H // *..............................
- // smlal2 v20.4S, v10.8H, v4.8H // ....*..........................
- // smlal v12.4S, v14.4H, v0.4H // .*.............................
- // smlal v23.4S, v10.4H, v4.4H // ..*............................
- // str q9, [x0, #16] // ...*...........................
- // smlal2 v20.4S, v13.8H, v15.8H // ........*......................
- // smlal v23.4S, v13.4H, v15.4H // .......*.......................
- // smlal2 v20.4S, v28.8H, v22.8H // ...........*...................
- // zip1 v26.8H, v19.8H, v27.8H // ......*........................
- // smlal v23.4S, v28.4H, v22.4H // ..........*....................
- // uzp2 v27.8H, v12.8H, v11.8H // .....*.........................
- // uzp1 v5.8H, v23.8H, v20.8H // ............*..................
- // str q26, [x0], #32 // .........*.....................
- // mul v31.8H, v5.8H, v2.8H // .............*.................
- // smlal2 v20.4S, v31.8H, v0.8H // ..............*................
- // smlal v23.4S, v31.4H, v0.4H // ...............*...............
- // uzp2 v19.8H, v23.8H, v20.8H // ................*..............
- // zip2 v9.8H, v19.8H, v27.8H // .................*.............
- // str q9, [x0, #16] // ...................*...........
- // zip1 v26.8H, v19.8H, v27.8H // ..................*............
- // str q26, [x0], #32 // ....................*..........
-
-
- pop_stack
- ret
-#endif /* MLKEM_K == 2 */
-
-#if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
-
- mov count, #(MLKEM_N / 16)
- // Instructions: 75
- // Expected cycles: 103
- // Expected IPC: 0.73
-
- // Cycle bound: 103.0
- // IPC bound: 0.73
-
- // Wall time: 0.94s
- // User time: 0.94s
-
- // --------------------------- original position ---------------------------->
- // 0 25 50
- // |------------------------|------------------------|
- ldr q7, [x2, #16] // *..........................................................................
- ldr q20, [x2], #32 // ..*........................................................................
- ldr q15, [x1, #16] // .*.........................................................................
- uzp1 v8.8H, v20.8H, v7.8H // ...............*...........................................................
- uzp2 v7.8H, v20.8H, v7.8H // ................*..........................................................
- ld1 {v20.8H}, [x3], #16 // ...*.......................................................................
- ldr q30, [x1], #32 // ..............*............................................................
- ldr q11, [x4], #32 // ....*......................................................................
- uzp1 v16.8H, v30.8H, v15.8H // .................*.........................................................
- uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................
- smull v30.4S, v16.4H, v7.4H // ...................*.......................................................
- smull2 v7.4S, v16.8H, v7.8H // ....................*......................................................
- smull v9.4S, v16.4H, v8.4H // .....................*.....................................................
- smull2 v16.4S, v16.8H, v8.8H // ......................*....................................................
- smlal v30.4S, v15.4H, v8.4H // .......................*...................................................
- smlal2 v7.4S, v15.8H, v8.8H // ........................*..................................................
- smlal v9.4S, v15.4H, v20.4H // .........................*.................................................
- smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................
- ldr q20, [x4, #-16] // .....*.....................................................................
- ldr q15, [x5], #32 // ......*....................................................................
- uzp1 v8.8H, v11.8H, v20.8H // ...........................*...............................................
- uzp2 v20.8H, v11.8H, v20.8H // ............................*..............................................
- ldr q11, [x5, #-16] // .......*...................................................................
- ld1 {v27.8H}, [x6], #16 // ........*..................................................................
- uzp1 v10.8H, v15.8H, v11.8H // .............................*.............................................
- uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................
- smlal v9.4S, v8.4H, v10.4H // ...............................*...........................................
- smlal2 v16.4S, v8.8H, v10.8H // ................................*..........................................
- smlal v30.4S, v8.4H, v15.4H // .................................*.........................................
- smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................
- smlal v9.4S, v20.4H, v27.4H // ...................................*.......................................
- smlal2 v16.4S, v20.8H, v27.8H // ....................................*......................................
- smlal v30.4S, v20.4H, v10.4H // .....................................*.....................................
- smlal2 v7.4S, v20.8H, v10.8H // ......................................*....................................
- ldr q20, [x7], #32 // .........*.................................................................
- ldr q15, [x7, #-16] // ..........*................................................................
- ldr q8, [x8], #32 // ...........*...............................................................
- uzp1 v11.8H, v20.8H, v15.8H // .......................................*...................................
- uzp2 v20.8H, v20.8H, v15.8H // ........................................*..................................
- ldr q15, [x8, #-16] // ............*..............................................................
- ld1 {v27.8H}, [x9], #16 // .............*.............................................................
- uzp1 v10.8H, v8.8H, v15.8H // .........................................*.................................
- uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................
- smlal v9.4S, v11.4H, v10.4H // ...........................................*...............................
- smlal2 v16.4S, v11.8H, v10.8H // ............................................*..............................
- smlal v30.4S, v11.4H, v15.4H // .............................................*.............................
- smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................
- smlal v9.4S, v20.4H, v27.4H // ...............................................*...........................
- smlal2 v16.4S, v20.8H, v27.8H // ................................................*..........................
- smlal v30.4S, v20.4H, v10.4H // .................................................*.........................
- smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................
- ldr q15, [x2], #32 // ...............................................................*...........
- uzp1 v20.8H, v9.8H, v16.8H // ....................................................*......................
- uzp1 v8.8H, v30.8H, v7.8H // .....................................................*.....................
- mul v20.8H, v20.8H, v2.8H // ......................................................*....................
- mul v8.8H, v8.8H, v2.8H // .......................................................*...................
- ldr q21, [x4], #32 // .................................................................*.........
- smlal v9.4S, v20.4H, v0.4H // ........................................................*..................
- smlal2 v16.4S, v20.8H, v0.8H // .........................................................*.................
- smlal v30.4S, v8.4H, v0.4H // ..........................................................*................
- smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*...............
- ldr q6, [x4, #-16] // ..................................................................*........
- uzp2 v27.8H, v9.8H, v16.8H // ............................................................*..............
- uzp2 v10.8H, v30.8H, v7.8H // .............................................................*.............
- ldr q16, [x2, #-16] // ...................................................*.......................
- ldr q30, [x1, #16] // ..............................................................*............
- ld1 {v9.8H}, [x3], #16 // ................................................................*..........
- ldr q1, [x5], #32 // ...................................................................*.......
- ldr q12, [x5, #-16] // ....................................................................*......
- ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
- ldr q19, [x7], #32 // ......................................................................*....
- ldr q31, [x7, #-16] // .......................................................................*...
- ldr q17, [x8], #32 // ........................................................................*..
- ldr q18, [x8, #-16] // .........................................................................*.
- ld1 {v25.8H}, [x9], #16 // ..........................................................................*
-
- // ------------------------------ new position ------------------------------>
- // 0 25 50
- // |------------------------|------------------------|------------------------
- // ldr q16, [x2, #16] // *..........................................................................
- // ldr q30, [x1, #16] // ..*........................................................................
- // ldr q15, [x2], #32 // .*.........................................................................
- // ld1 {v9.8H}, [x3], #16 // .....*.....................................................................
- // ldr q21, [x4], #32 // .......*...................................................................
- // ldr q6, [x4, #-16] // ..................*........................................................
- // ldr q1, [x5], #32 // ...................*.......................................................
- // ldr q12, [x5, #-16] // ......................*....................................................
- // ld1 {v24.8H}, [x6], #16 // .......................*...................................................
- // ldr q19, [x7], #32 // ..................................*........................................
- // ldr q31, [x7, #-16] // ...................................*.......................................
- // ldr q17, [x8], #32 // ....................................*......................................
- // ldr q18, [x8, #-16] // .......................................*...................................
- // ld1 {v25.8H}, [x9], #16 // ........................................*..................................
- // ldr q20, [x1], #32 // ......*....................................................................
- // uzp1 v7.8H, v15.8H, v16.8H // ...*.......................................................................
- // uzp2 v15.8H, v15.8H, v16.8H // ....*......................................................................
- // uzp1 v8.8H, v20.8H, v30.8H // ........*..................................................................
- // uzp2 v20.8H, v20.8H, v30.8H // .........*.................................................................
- // smull v30.4S, v8.4H, v15.4H // ..........*................................................................
- // smull2 v15.4S, v8.8H, v15.8H // ...........*...............................................................
- // smull v11.4S, v8.4H, v7.4H // ............*..............................................................
- // smull2 v8.4S, v8.8H, v7.8H // .............*.............................................................
- // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................
- // smlal2 v15.4S, v20.8H, v7.8H // ...............*...........................................................
- // smlal v11.4S, v20.4H, v9.4H // ................*..........................................................
- // smlal2 v8.4S, v20.8H, v9.8H // .................*.........................................................
- // uzp1 v7.8H, v21.8H, v6.8H // ....................*......................................................
- // uzp2 v20.8H, v21.8H, v6.8H // .....................*.....................................................
- // uzp1 v16.8H, v1.8H, v12.8H // ........................*..................................................
- // uzp2 v9.8H, v1.8H, v12.8H // .........................*.................................................
- // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................
- // smlal2 v8.4S, v7.8H, v16.8H // ...........................*...............................................
- // smlal v30.4S, v7.4H, v9.4H // ............................*..............................................
- // smlal2 v15.4S, v7.8H, v9.8H // .............................*.............................................
- // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................
- // smlal2 v8.4S, v20.8H, v24.8H // ...............................*...........................................
- // smlal v30.4S, v20.4H, v16.4H // ................................*..........................................
- // smlal2 v15.4S, v20.8H, v16.8H // .................................*.........................................
- // uzp1 v7.8H, v19.8H, v31.8H // .....................................*.....................................
- // uzp2 v20.8H, v19.8H, v31.8H // ......................................*....................................
- // uzp1 v16.8H, v17.8H, v18.8H // .........................................*.................................
- // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................
- // smlal v11.4S, v7.4H, v16.4H // ...........................................*...............................
- // smlal2 v8.4S, v7.8H, v16.8H // ............................................*..............................
- // smlal v30.4S, v7.4H, v9.4H // .............................................*.............................
- // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................
- // smlal v11.4S, v20.4H, v25.4H // ...............................................*...........................
- // smlal2 v8.4S, v20.8H, v25.8H // ................................................*..........................
- // smlal v30.4S, v20.4H, v16.4H // .................................................*.........................
- // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................
- // ldr q16, [x2, #16] // ................................................................*..........
- // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*......................
- // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*.....................
- // mul v7.8H, v7.8H, v2.8H // ......................................................*....................
- // mul v20.8H, v20.8H, v2.8H // .......................................................*...................
- // smlal v11.4S, v7.4H, v0.4H // .........................................................*.................
- // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................
- // smlal v30.4S, v20.4H, v0.4H // ...........................................................*...............
- // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*..............
- // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............
- // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*...........
- // ldr q30, [x1, #16] // .................................................................*.........
- // ldr q15, [x2], #32 // ...................................................*.......................
- // ld1 {v9.8H}, [x3], #16 // ..................................................................*........
- // ldr q21, [x4], #32 // ........................................................*..................
- // ldr q6, [x4, #-16] // .............................................................*.............
- // ldr q1, [x5], #32 // ...................................................................*.......
- // ldr q12, [x5, #-16] // ....................................................................*......
- // ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
- // ldr q19, [x7], #32 // ......................................................................*....
- // ldr q31, [x7, #-16] // .......................................................................*...
- // ldr q17, [x8], #32 // ........................................................................*..
- // ldr q18, [x8, #-16] // .........................................................................*.
- // ld1 {v25.8H}, [x9], #16 // ..........................................................................*
-
- sub count, count, #2
-1:
- // Instructions: 65
- // Expected cycles: 80
- // Expected IPC: 0.81
-
- // Cycle bound: 80.0
- // IPC bound: 0.81
-
- // Wall time: 11.64s
- // User time: 11.64s
-
- // ---------------------- original position ----------------------->
- // 0 25 50
- // |------------------------|------------------------|--------------
- ldr q20, [x1], #32 // *................................................................
- uzp1 v7.8H, v15.8H, v16.8H // ......*..........................................................
- uzp2 v15.8H, v15.8H, v16.8H // .......*.........................................................
- uzp1 v8.8H, v20.8H, v30.8H // ..*..............................................................
- uzp2 v20.8H, v20.8H, v30.8H // ...*.............................................................
- smull v30.4S, v8.4H, v15.4H // .............*...................................................
- smull2 v15.4S, v8.8H, v15.8H // ..............*..................................................
- smull v11.4S, v8.4H, v7.4H // .........*.......................................................
- smull2 v8.4S, v8.8H, v7.8H // ..........*......................................................
- smlal v30.4S, v20.4H, v7.4H // ...............*.................................................
- smlal2 v15.4S, v20.8H, v7.8H // ................*................................................
- smlal v11.4S, v20.4H, v9.4H // ...........*.....................................................
- smlal2 v8.4S, v20.8H, v9.8H // ............*....................................................
- uzp1 v7.8H, v21.8H, v6.8H // ...................*.............................................
- uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................
- uzp1 v16.8H, v1.8H, v12.8H // .......................*.........................................
- uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................
- smlal v11.4S, v7.4H, v16.4H // ..........................*......................................
- smlal2 v8.4S, v7.8H, v16.8H // ...........................*.....................................
- smlal v30.4S, v7.4H, v9.4H // ..............................*..................................
- smlal2 v15.4S, v7.8H, v9.8H // ...............................*.................................
- smlal v11.4S, v20.4H, v24.4H // ............................*....................................
- smlal2 v8.4S, v20.8H, v24.8H // .............................*...................................
- smlal v30.4S, v20.4H, v16.4H // ................................*................................
- smlal2 v15.4S, v20.8H, v16.8H // .................................*...............................
- uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................
- uzp2 v20.8H, v19.8H, v31.8H // .....................................*...........................
- uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................
- uzp2 v9.8H, v17.8H, v18.8H // .........................................*.......................
- smlal v11.4S, v7.4H, v16.4H // ...........................................*.....................
- smlal2 v8.4S, v7.8H, v16.8H // ............................................*....................
- smlal v30.4S, v7.4H, v9.4H // ...............................................*.................
- smlal2 v15.4S, v7.8H, v9.8H // ................................................*................
- smlal v11.4S, v20.4H, v25.4H // .............................................*...................
- smlal2 v8.4S, v20.8H, v25.8H // ..............................................*..................
- smlal v30.4S, v20.4H, v16.4H // .................................................*...............
- smlal2 v15.4S, v20.8H, v16.8H // ..................................................*..............
- ldr q16, [x2, #16] // .....e...........................................................
- uzp1 v7.8H, v11.8H, v8.8H // ...................................................*.............
- uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........
- mul v7.8H, v7.8H, v2.8H // ....................................................*............
- mul v20.8H, v20.8H, v2.8H // .........................................................*.......
- zip2 v9.8H, v27.8H, v10.8H // ..............................................................l..
- zip1 v27.8H, v27.8H, v10.8H // .............................................................l...
- smlal v11.4S, v7.4H, v0.4H // .....................................................*...........
- smlal2 v8.4S, v7.8H, v0.8H // ......................................................*..........
- smlal v30.4S, v20.4H, v0.4H // ..........................................................*......
- smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*.....
- str q27, [x0], #32 // ...............................................................l.
- uzp2 v27.8H, v11.8H, v8.8H // .......................................................*.........
- str q9, [x0, #-16] // ................................................................l
- uzp2 v10.8H, v30.8H, v15.8H // ............................................................*....
- ldr q30, [x1, #16] // .e...............................................................
- ldr q15, [x2], #32 // ....e............................................................
- ld1 {v9.8H}, [x3], #16 // ........e........................................................
- ldr q21, [x4], #32 // .................e...............................................
- ldr q6, [x4, #-16] // ..................e..............................................
- ldr q1, [x5], #32 // .....................e...........................................
- ldr q12, [x5, #-16] // ......................e..........................................
- ld1 {v24.8H}, [x6], #16 // .........................e.......................................
- ldr q19, [x7], #32 // ..................................e..............................
- ldr q31, [x7, #-16] // ...................................e.............................
- ldr q17, [x8], #32 // ......................................e..........................
- ldr q18, [x8, #-16] // .......................................e.........................
- ld1 {v25.8H}, [x9], #16 // ..........................................e......................
-
- // ---------------------------------------------------------------- new position ----------------------------------------------------------------->
- // 0 25 50 75 100 125
- // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------
- // ldr q12, [x1], #32 // ............................*................................................................~..................................................
- // ldr q13, [x1, #-16] // ...............e............'...................................................~............'..................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~...............................................
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~..............................................
- // ldr q12, [x2], #32 // ................e...........'....................................................~...........'..................................................
- // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~.............
- // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~.................................................
- // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................
- // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'..................................................
- // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~...........................................
- // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~..........................................
- // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~.......................................
- // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~......................................
- // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~.............................................
- // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................
- // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~.........................................
- // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................
- // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'..................................................
- // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'..................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~.....................................
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~....................................
- // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'..................................................
- // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'..................................................
- // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~...................................
- // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~..................................
- // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'..................................................
- // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~.................................
- // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................
- // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~.............................
- // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................
- // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~...............................
- // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~..............................
- // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~...........................
- // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~..........................
- // ldr q12, [x7], #32 // .......................e....'...........................................................~....'..................................................
- // ldr q13, [x7, #-16] // ........................e...'............................................................~...'..................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~.........................
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................
- // ldr q12, [x8], #32 // .........................e..'.............................................................~..'..................................................
- // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'..................................................
- // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~.......................
- // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~......................
- // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'..................................................
- // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~.....................
- // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~....................
- // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~.................
- // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................
- // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~...................
- // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~..................
- // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~...............
- // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~..............
- // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............
- // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~..........
- // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~......
- // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~.....
- // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~.
- // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~...........
- // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~.........
- // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~....
- // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~...
- // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'..................................................
- // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l.......
- // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........
- // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l..
- // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l
-
- sub count, count, #1
- cbnz count, 1b
- // Instructions: 55
- // Expected cycles: 61
- // Expected IPC: 0.90
-
- // Cycle bound: 61.0
- // IPC bound: 0.90
-
- // Wall time: 8.41s
- // User time: 8.41s
-
- // ----------------- original position ------------------>
- // 0 25 50
- // |------------------------|------------------------|----
- ldr q7, [x1], #32 // *......................................................
- uzp1 v20.8H, v15.8H, v16.8H // .*.....................................................
- uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
- uzp1 v23.8H, v7.8H, v30.8H // ...*...................................................
- uzp2 v11.8H, v7.8H, v30.8H // ....*..................................................
- smull2 v8.4S, v23.8H, v20.8H // ........*..............................................
- smull v5.4S, v23.4H, v20.4H // .......*...............................................
- smull2 v30.4S, v23.8H, v15.8H // ......*................................................
- uzp1 v28.8H, v1.8H, v12.8H // ...............*.......................................
- smlal2 v8.4S, v11.8H, v9.8H // ............*..........................................
- smlal v5.4S, v11.4H, v9.4H // ...........*...........................................
- uzp1 v3.8H, v21.8H, v6.8H // .............*.........................................
- smull v16.4S, v23.4H, v15.4H // .....*.................................................
- smlal2 v8.4S, v3.8H, v28.8H // ..................*....................................
- smlal v5.4S, v3.4H, v28.4H // .................*.....................................
- uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................
- uzp1 v7.8H, v17.8H, v18.8H // ...........................*...........................
- smlal2 v8.4S, v29.8H, v24.8H // ......................*................................
- uzp1 v14.8H, v19.8H, v31.8H // .........................*.............................
- smlal v16.4S, v11.4H, v20.4H // .........*.............................................
- smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................
- smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................
- uzp2 v20.8H, v1.8H, v12.8H // ................*......................................
- uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................
- smlal2 v30.4S, v3.8H, v20.8H // ....................*..................................
- smlal v16.4S, v3.4H, v20.4H // ...................*...................................
- smlal v5.4S, v29.4H, v24.4H // .....................*.................................
- uzp2 v9.8H, v17.8H, v18.8H // ............................*..........................
- smlal2 v30.4S, v29.8H, v28.8H // ........................*..............................
- smlal v16.4S, v29.4H, v28.4H // .......................*...............................
- smlal v5.4S, v14.4H, v7.4H // .............................*.........................
- smlal2 v8.4S, v21.8H, v25.8H // ..................................*....................
- smlal2 v30.4S, v14.8H, v9.8H // ................................*......................
- smlal v16.4S, v14.4H, v9.4H // ...............................*.......................
- smlal v5.4S, v21.4H, v25.4H // .................................*.....................
- zip1 v20.8H, v27.8H, v10.8H // ..........................................*............
- smlal2 v30.4S, v21.8H, v7.8H // ....................................*..................
- smlal v16.4S, v21.4H, v7.4H // ...................................*...................
- uzp1 v7.8H, v5.8H, v8.8H // .....................................*.................
- str q20, [x0], #32 // ...............................................*.......
- mul v15.8H, v7.8H, v2.8H // .......................................*...............
- uzp1 v7.8H, v16.8H, v30.8H // ......................................*................
- zip2 v31.8H, v27.8H, v10.8H // .........................................*.............
- mul v20.8H, v7.8H, v2.8H // ........................................*..............
- smlal v5.4S, v15.4H, v0.4H // ...........................................*...........
- smlal2 v8.4S, v15.8H, v0.8H // ............................................*..........
- str q31, [x0, #-16] // .................................................*.....
- smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........
- smlal v16.4S, v20.4H, v0.4H // .............................................*.........
- uzp2 v15.8H, v5.8H, v8.8H // ................................................*......
- uzp2 v20.8H, v16.8H, v30.8H // ..................................................*....
- zip1 v7.8H, v15.8H, v20.8H // ....................................................*..
- zip2 v20.8H, v15.8H, v20.8H // ...................................................*...
- str q7, [x0], #32 // .....................................................*.
- str q20, [x0, #-16] // ......................................................*
-
- // -------------------- new position -------------------->
- // 0 25 50
- // |------------------------|------------------------|----
- // ldr q20, [x1], #32 // *......................................................
- // uzp1 v7.8H, v15.8H, v16.8H // .*.....................................................
- // uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
- // uzp1 v8.8H, v20.8H, v30.8H // ...*...................................................
- // uzp2 v20.8H, v20.8H, v30.8H // ....*..................................................
- // smull v30.4S, v8.4H, v15.4H // ............*..........................................
- // smull2 v15.4S, v8.8H, v15.8H // .......*...............................................
- // smull v11.4S, v8.4H, v7.4H // ......*................................................
- // smull2 v8.4S, v8.8H, v7.8H // .....*.................................................
- // smlal v30.4S, v20.4H, v7.4H // ...................*...................................
- // smlal2 v15.4S, v20.8H, v7.8H // ....................*..................................
- // smlal v11.4S, v20.4H, v9.4H // ..........*............................................
- // smlal2 v8.4S, v20.8H, v9.8H // .........*.............................................
- // uzp1 v7.8H, v21.8H, v6.8H // ...........*...........................................
- // uzp2 v20.8H, v21.8H, v6.8H // ...............*.......................................
- // uzp1 v16.8H, v1.8H, v12.8H // ........*..............................................
- // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................
- // smlal v11.4S, v7.4H, v16.4H // ..............*........................................
- // smlal2 v8.4S, v7.8H, v16.8H // .............*.........................................
- // smlal v30.4S, v7.4H, v9.4H // .........................*.............................
- // smlal2 v15.4S, v7.8H, v9.8H // ........................*..............................
- // smlal v11.4S, v20.4H, v24.4H // ..........................*............................
- // smlal2 v8.4S, v20.8H, v24.8H // .................*.....................................
- // smlal v30.4S, v20.4H, v16.4H // .............................*.........................
- // smlal2 v15.4S, v20.8H, v16.8H // ............................*..........................
- // uzp1 v7.8H, v19.8H, v31.8H // ..................*....................................
- // uzp2 v20.8H, v19.8H, v31.8H // .......................*...............................
- // uzp1 v16.8H, v17.8H, v18.8H // ................*......................................
- // uzp2 v9.8H, v17.8H, v18.8H // ...........................*...........................
- // smlal v11.4S, v7.4H, v16.4H // ..............................*........................
- // smlal2 v8.4S, v7.8H, v16.8H // .....................*.................................
- // smlal v30.4S, v7.4H, v9.4H // .................................*.....................
- // smlal2 v15.4S, v7.8H, v9.8H // ................................*......................
- // smlal v11.4S, v20.4H, v25.4H // ..................................*....................
- // smlal2 v8.4S, v20.8H, v25.8H // ...............................*.......................
- // smlal v30.4S, v20.4H, v16.4H // .....................................*.................
- // smlal2 v15.4S, v20.8H, v16.8H // ....................................*..................
- // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................
- // uzp1 v20.8H, v30.8H, v15.8H // .........................................*.............
- // mul v7.8H, v7.8H, v2.8H // ........................................*..............
- // mul v20.8H, v20.8H, v2.8H // ...........................................*...........
- // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............
- // zip1 v27.8H, v27.8H, v10.8H // ...................................*...................
- // smlal v11.4S, v7.4H, v0.4H // ............................................*..........
- // smlal2 v8.4S, v7.8H, v0.8H // .............................................*.........
- // smlal v30.4S, v20.4H, v0.4H // ................................................*......
- // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*.......
- // str q27, [x0], #32 // .......................................*...............
- // uzp2 v27.8H, v11.8H, v8.8H // .................................................*.....
- // str q9, [x0, #-16] // ..............................................*........
- // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*....
- // zip2 v9.8H, v27.8H, v10.8H // ....................................................*..
- // zip1 v27.8H, v27.8H, v10.8H // ...................................................*...
- // str q27, [x0], #32 // .....................................................*.
- // str q9, [x0, #-16] // ......................................................*
-
-
- pop_stack
- ret
-#endif /* MLKEM_K == 3 */
-
-#if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
- add a3_ptr, a0_ptr, #(3 * 512)
- add b3_ptr, b0_ptr, #(3 * 512)
- add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2)
-
- // Bounds:
-
- // Each pmull is bound by 2*4096*2^15=2^28, so the final value
- // before Montgomery reduction is bound by 2^30.
-
- mov count, #(MLKEM_N / 16)
- // Instructions: 114
- // Expected cycles: 153
- // Expected IPC: 0.75
- //
- // Cycle bound: 153.0
- // IPC bound: 0.75
- //
- // Wall time: 0.69s
- // User time: 0.69s
- //
- // ----------------------------------------------- original position ----------------------------------------------->
- // 0 25 50 75 100
- // |------------------------|------------------------|------------------------|------------------------|-------------
- ldr q23, [x2, #16] // .*................................................................................................................
- ldr q19, [x2], #32 // *.................................................................................................................
- ldr q17, [x5], #32 // ..*...............................................................................................................
- uzp2 v13.8H, v19.8H, v23.8H // ..........*.......................................................................................................
- uzp1 v19.8H, v19.8H, v23.8H // ...........*......................................................................................................
- ldr q23, [x5, #-16] // ...*..............................................................................................................
- ldr q30, [x1, #16] // .....*............................................................................................................
- uzp2 v9.8H, v17.8H, v23.8H // ....*.............................................................................................................
- uzp1 v23.8H, v17.8H, v23.8H // .......*..........................................................................................................
- ldr q17, [x1], #32 // ......*...........................................................................................................
- ldr q10, [x7, #16] // .............*....................................................................................................
- uzp1 v12.8H, v17.8H, v30.8H // ........*.........................................................................................................
- uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................
- smull2 v30.4S, v12.8H, v13.8H // ............*.....................................................................................................
- smull v13.4S, v12.4H, v13.4H // ............................................*.....................................................................
- smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................
- smull v12.4S, v12.4H, v19.4H // ..........................................*.......................................................................
- smlal2 v30.4S, v17.8H, v19.8H // ...............................*..................................................................................
- smlal v13.4S, v17.4H, v19.4H // ...............................................*..................................................................
- ldr q19, [x4], #32 // ....................*.............................................................................................
- ldr q16, [x4, #-16] // .....................*............................................................................................
- ld1 {v8.8H}, [x3], #16 // ................................*.................................................................................
- uzp1 v26.8H, v19.8H, v16.8H // .......................*..........................................................................................
- uzp2 v19.8H, v19.8H, v16.8H // ........................*.........................................................................................
- smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................
- smlal v13.4S, v26.4H, v9.4H // ..................................................*...............................................................
- smlal2 v22.4S, v17.8H, v8.8H // ........................................*.........................................................................
- smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................
- smlal2 v30.4S, v19.8H, v23.8H // ...................................*..............................................................................
- smlal v13.4S, v19.4H, v23.4H // .......................................................*..........................................................
- smlal2 v22.4S, v26.8H, v23.8H // ...........................................*......................................................................
- smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................
- ldr q23, [x7], #32 // ......................*...........................................................................................
- ldr q17, [x8, #16] // ..............*...................................................................................................
- uzp1 v9.8H, v23.8H, v10.8H // ..........................*.......................................................................................
- uzp2 v23.8H, v23.8H, v10.8H // ....................................*.............................................................................
- ldr q10, [x10], #32 // ...............*..................................................................................................
- ldr q16, [x10, #-16] // ................*.................................................................................................
- ld1 {v8.8H}, [x12], #16 // .................*................................................................................................
- uzp1 v26.8H, v10.8H, v16.8H // ..................*...............................................................................................
- uzp2 v10.8H, v10.8H, v16.8H // ...................*..............................................................................................
- ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................
- ldr q3, [x11, #16] // ...........................*......................................................................................
- smlal2 v22.4S, v19.8H, v16.8H // ..............................................*...................................................................
- smlal v12.4S, v19.4H, v16.4H // ........................................................*.........................................................
- ldr q19, [x11], #32 // ............................*.....................................................................................
- ld1 {v16.8H}, [x9], #16 // .............................*....................................................................................
- uzp1 v4.8H, v19.8H, v3.8H // ..................................*...............................................................................
- uzp2 v19.8H, v19.8H, v3.8H // .......................................*..........................................................................
- ldr q3, [x8], #32 // ..............................*...................................................................................
- ldr q31, [x2], #32 // ......................................*...........................................................................
- uzp1 v6.8H, v3.8H, v17.8H // ...................................................*..............................................................
- uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................
- smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*.......................................................
- smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*......................................................
- smlal v13.4S, v9.4H, v17.4H // ............................................................*.....................................................
- smlal v12.4S, v9.4H, v6.4H // .............................................................*....................................................
- smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*...................................................
- smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*..................................................
- smlal v13.4S, v23.4H, v6.4H // ................................................................*.................................................
- smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................
- smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*...............................................
- smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*..............................................
- smlal v13.4S, v26.4H, v19.4H // ....................................................................*.............................................
- smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................
- smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*...........................................
- smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*..........................................
- smlal v13.4S, v10.4H, v4.4H // ........................................................................*.........................................
- smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................
- ldr q19, [x2, #-16] // .........................................*........................................................................
- uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*......................................
- uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*.............................
- mul v23.8H, v23.8H, v2.8H // .............................................................................*....................................
- uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*.................................
- uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*..............................
- mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................
- smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................
- smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*...............................
- ldr q23, [x5], #32 // .............................................*....................................................................
- smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*......
- uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*...........................
- smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*.....
- ldr q17, [x5, #-16] // ................................................*.................................................................
- ldr q13, [x1, #16] // ......................................................*...........................................................
- uzp2 v27.8H, v23.8H, v17.8H // ....................................................*.............................................................
- uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*.....................................
- uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*..
- ldr q23, [x1], #32 // ..........................................................................*.......................................
- zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................*
- ldr q3, [x7, #16] // ........................................................................................*.........................
- uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*...................................
- uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*..................................
- smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*..........................
- ldr q6, [x8, #16] // .........................................................................................*........................
- ldr q23, [x10], #32 // ..........................................................................................*.......................
- smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*.......
- ldr q17, [x10, #-16] // ...........................................................................................*......................
- ld1 {v22.8H}, [x12], #16 // ............................................................................................*.....................
- uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*....................
- uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*...................
- ldr q23, [x4], #32 // ...............................................................................................*..................
- ldr q17, [x4, #-16] // ................................................................................................*.................
- ldr q4, [x7], #32 // .................................................................................................*................
- uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*...............
- uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*..............
- uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............
- smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*...
- ld1 {v8.8H}, [x6], #16 // ....................................................................................................*.............
- ldr q25, [x11, #16] // ......................................................................................................*...........
- ldr q29, [x11], #32 // .......................................................................................................*..........
- ld1 {v12.8H}, [x9], #16 // ........................................................................................................*.........
- uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*.
- ldr q14, [x8], #32 // .........................................................................................................*........
- ld1 {v23.8H}, [x3], #16 // .............................................................................................................*....
-
- // ------------------------------------------------- new position -------------------------------------------------->
- // 0 25 50 75 100
- // |------------------------|------------------------|------------------------|------------------------|-------------
- // ldr q3, [x2], #32 // .*................................................................................................................
- // ldr q17, [x2, #-16] // *.................................................................................................................
- // ldr q21, [x5], #32 // ..*...............................................................................................................
- // ldr q19, [x5, #-16] // .....*............................................................................................................
- // uzp2 v27.8H, v21.8H, v19.8H // .......*..........................................................................................................
- // ldr q25, [x1, #16] // ......*...........................................................................................................
- // ldr q22, [x1], #32 // .........*........................................................................................................
- // uzp1 v28.8H, v21.8H, v19.8H // ........*.........................................................................................................
- // uzp1 v31.8H, v22.8H, v25.8H // ...........*......................................................................................................
- // uzp2 v16.8H, v22.8H, v25.8H // ............*.....................................................................................................
- // uzp2 v21.8H, v3.8H, v17.8H // ...*..............................................................................................................
- // uzp1 v19.8H, v3.8H, v17.8H // ....*.............................................................................................................
- // smull2 v24.4S, v31.8H, v21.8H // .............*....................................................................................................
- // ldr q3, [x7, #16] // ..........*.......................................................................................................
- // ldr q6, [x8, #16] // .................................*................................................................................
- // ldr q8, [x10], #32 // ....................................*.............................................................................
- // ldr q26, [x10, #-16] // .....................................*............................................................................
- // ld1 {v22.8H}, [x12], #16 // ......................................*...........................................................................
- // uzp1 v30.8H, v8.8H, v26.8H // .......................................*..........................................................................
- // uzp2 v11.8H, v8.8H, v26.8H // ........................................*.........................................................................
- // ldr q8, [x4], #32 // ...................*..............................................................................................
- // ldr q26, [x4, #-16] // ....................*.............................................................................................
- // ldr q4, [x7], #32 // ................................*.................................................................................
- // uzp1 v20.8H, v8.8H, v26.8H // ......................*...........................................................................................
- // uzp2 v26.8H, v8.8H, v26.8H // .......................*..........................................................................................
- // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................
- // uzp1 v9.8H, v4.8H, v3.8H // ..................................*...............................................................................
- // ldr q25, [x11, #16] // ..........................................*.......................................................................
- // ldr q29, [x11], #32 // .............................................*....................................................................
- // ld1 {v12.8H}, [x9], #16 // ..............................................*...................................................................
- // ldr q14, [x8], #32 // .................................................*................................................................
- // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................
- // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................
- // smlal2 v24.4S, v20.8H, v27.8H // ........................*.........................................................................................
- // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*..................................................................
- // smlal2 v24.4S, v26.8H, v28.8H // ............................*.....................................................................................
- // uzp2 v4.8H, v4.8H, v3.8H // ...................................*..............................................................................
- // smull2 v13.4S, v31.8H, v19.8H // ...............*..................................................................................................
- // ldr q3, [x2], #32 // ..................................................*...............................................................
- // uzp2 v1.8H, v29.8H, v25.8H // ................................................*.................................................................
- // smlal2 v13.4S, v16.8H, v23.8H // ..........................*.......................................................................................
- // ldr q17, [x2, #-16] // .....................................................................*............................................
- // smull v18.4S, v31.4H, v19.4H // ................*.................................................................................................
- // smlal2 v13.4S, v20.8H, v28.8H // ..............................*...................................................................................
- // smull v29.4S, v31.4H, v21.4H // ..............*...................................................................................................
- // ldr q21, [x5], #32 // ..............................................................................*...................................
- // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*......................................................................
- // smlal v29.4S, v16.4H, v19.4H // ..................*...............................................................................................
- // ldr q19, [x5, #-16] // ..................................................................................*...............................
- // smlal v18.4S, v16.4H, v23.4H // ...........................*......................................................................................
- // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................
- // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*..............................................................
- // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*.............................
- // smlal v18.4S, v20.4H, v28.4H // ...............................*..................................................................................
- // ldr q25, [x1, #16] // ...................................................................................*..............................
- // smlal v29.4S, v26.4H, v28.4H // .............................*....................................................................................
- // smlal v18.4S, v26.4H, v8.4H // ............................................*.....................................................................
- // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*.............................................................
- // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................
- // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*...........................................................
- // smlal v29.4S, v9.4H, v26.4H // .......................................................*..........................................................
- // smlal v18.4S, v9.4H, v31.4H // ........................................................*.........................................................
- // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................
- // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*.......................................................
- // smlal v29.4S, v4.4H, v31.4H // ...........................................................*......................................................
- // smlal v18.4S, v4.4H, v12.4H // ............................................................*.....................................................
- // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................................................
- // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*...................................................
- // smlal v29.4S, v30.4H, v1.4H // ...............................................................*..................................................
- // smlal v18.4S, v30.4H, v10.4H // ................................................................*.................................................
- // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................
- // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*...............................................
- // smlal v29.4S, v11.4H, v10.4H // ...................................................................*..............................................
- // smlal v18.4S, v11.4H, v22.4H // ....................................................................*.............................................
- // ldr q22, [x1], #32 // .......................................................................................*..........................
- // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*...........................................
- // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................
- // mul v19.8H, v31.8H, v2.8H // ........................................................................*.........................................
- // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*.......................
- // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*......................
- // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................
- // smlal v29.4S, v19.4H, v0.4H // ............................................................................*.....................................
- // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*....................................
- // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*.......................................
- // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*..........................................
- // mul v23.8H, v26.8H, v2.8H // ...........................................................................*......................................
- // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*.................................
- // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*.....................
- // ldr q3, [x7, #16] // .........................................................................................*........................
- // ldr q6, [x8, #16] // .............................................................................................*....................
- // ldr q8, [x10], #32 // ..............................................................................................*...................
- // ldr q26, [x10, #-16] // ................................................................................................*.................
- // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................
- // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*...............
- // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*..............
- // ldr q8, [x4], #32 // ....................................................................................................*.............
- // ldr q26, [x4, #-16] // .....................................................................................................*............
- // ldr q4, [x7], #32 // ......................................................................................................*...........
- // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*..........
- // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*.........
- // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*......
- // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........
- // ldr q25, [x11, #16] // ............................................................................................................*.....
- // ldr q29, [x11], #32 // .............................................................................................................*....
- // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*...
- // ldr q14, [x8], #32 // ................................................................................................................*.
- // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*..................
- // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*..................................
- // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................
- // ld1 {v23.8H}, [x3], #16 // .................................................................................................................*
- // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*.......
- // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*...........................
- // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*..
- // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*.........................
-
- sub count, count, #2
-1:
- // Instructions: 82
- // Expected cycles: 102
- // Expected IPC: 0.80
- //
- // Cycle bound: 102.0
- // IPC bound: 0.80
- //
- // Wall time: 15.93s
- // User time: 15.93s
- //
- // ------------------------------- original position ------------------------------->
- // 0 25 50 75
- // |------------------------|------------------------|------------------------|------
- smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................
- uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................
- smull2 v13.4S, v31.8H, v19.8H // ..........*.......................................................................
- ldr q3, [x2], #32 // ....e.............................................................................
- uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*.......................
- smlal2 v13.4S, v16.8H, v23.8H // ............*.....................................................................
- ldr q17, [x2, #-16] // .....e............................................................................
- smull v18.4S, v31.4H, v19.4H // .........*........................................................................
- smlal2 v13.4S, v20.8H, v28.8H // ...........................*......................................................
- smull v29.4S, v31.4H, v21.4H // .............*....................................................................
- ldr q21, [x5], #32 // .....................e............................................................
- smlal2 v13.4S, v26.8H, v8.8H // .............................*....................................................
- smlal v29.4S, v16.4H, v19.4H // ...............*..................................................................
- ldr q19, [x5, #-16] // ......................e...........................................................
- smlal v18.4S, v16.4H, v23.4H // ...........*......................................................................
- smlal v29.4S, v20.4H, v27.4H // ..............................*...................................................
- uzp1 v31.8H, v14.8H, v6.8H // ........................................*.........................................
- uzp2 v27.8H, v21.8H, v19.8H // ........................e.........................................................
- smlal v18.4S, v20.4H, v28.4H // ..........................*.......................................................
- ldr q25, [x1, #16] // .e................................................................................
- smlal v29.4S, v26.4H, v28.4H // ................................*.................................................
- smlal v18.4S, v26.4H, v8.4H // ............................*.....................................................
- uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................
- smlal2 v13.4S, v9.8H, v31.8H // ............................................*.....................................
- smlal2 v24.4S, v9.8H, v26.8H // ................................................*.................................
- smlal v29.4S, v9.4H, v26.4H // ...............................................*..................................
- smlal v18.4S, v9.4H, v31.4H // ...........................................*......................................
- smlal2 v13.4S, v4.8H, v12.8H // ..............................................*...................................
- smlal2 v24.4S, v4.8H, v31.8H // ..................................................*...............................
- smlal v29.4S, v4.4H, v31.4H // .................................................*................................
- smlal v18.4S, v4.4H, v12.4H // .............................................*....................................
- smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................
- smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................
- smlal v29.4S, v30.4H, v1.4H // ................................................................*.................
- smlal v18.4S, v30.4H, v10.4H // ............................................................*.....................
- smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*..................
- smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*..............
- smlal v29.4S, v11.4H, v10.4H // ..................................................................*...............
- smlal v18.4S, v11.4H, v22.4H // ..............................................................*...................
- ldr q22, [x1], #32 // e.................................................................................
- uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........
- uzp1 v28.8H, v21.8H, v19.8H // .......................e..........................................................
- mul v19.8H, v31.8H, v2.8H // ..........................................................................*.......
- uzp1 v31.8H, v22.8H, v25.8H // ..e...............................................................................
- uzp2 v16.8H, v22.8H, v25.8H // ...e..............................................................................
- uzp2 v21.8H, v3.8H, v17.8H // .......e..........................................................................
- smlal v29.4S, v19.4H, v0.4H // ...........................................................................*......
- smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*.....
- uzp1 v19.8H, v3.8H, v17.8H // ......e...........................................................................
- uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*.............
- zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l..
- mul v23.8H, v26.8H, v2.8H // .....................................................................*............
- uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*....
- smull2 v24.4S, v31.8H, v21.8H // ..............e...................................................................
- str q14, [x0, #16] // .................................................................................l
- ldr q3, [x7, #16] // ...................................e..............................................
- ldr q6, [x8, #16] // .......................................e..........................................
- ldr q8, [x10], #32 // ...................................................e..............................
- ldr q26, [x10, #-16] // ....................................................e.............................
- ld1 {v22.8H}, [x12], #16 // ...........................................................e......................
- uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................
- uzp2 v11.8H, v8.8H, v26.8H // ......................................................e...........................
- ldr q8, [x4], #32 // .................e................................................................
- ldr q26, [x4, #-16] // ..................e...............................................................
- ldr q4, [x7], #32 // ..................................e...............................................
- uzp1 v20.8H, v8.8H, v26.8H // ...................e..............................................................
- uzp2 v26.8H, v8.8H, v26.8H // ....................e.............................................................
- ld1 {v8.8H}, [x6], #16 // .........................e........................................................
- uzp1 v9.8H, v4.8H, v3.8H // ....................................e.............................................
- ldr q25, [x11, #16] // ........................................................e.........................
- ldr q29, [x11], #32 // .......................................................e..........................
- ld1 {v12.8H}, [x9], #16 // ..........................................e.......................................
- ldr q14, [x8], #32 // ......................................e...........................................
- smlal2 v24.4S, v16.8H, v19.8H // ................e.................................................................
- smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*..........
- smlal v18.4S, v23.4H, v0.4H // ......................................................................*...........
- ld1 {v23.8H}, [x3], #16 // ........e.........................................................................
- smlal2 v24.4S, v20.8H, v27.8H // ...............................e..................................................
- uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*.........
- uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................
- str q5, [x0], #32 // ................................................................................l.
- zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*...
-
- // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------>
- // 0 25 50 75 100 125 150 175 200 225
- // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------
- // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~.........................................
- // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~.............................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~.....................................
- // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~....................................
- // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~.............................................................................
- // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~..........................................................................
- // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................
- // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~...................................
- // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~....
- // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~.........................................................................
- // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~..............................................................................
- // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~..................................................................
- // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~...........................................................................
- // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~.......................................................................
- // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~...........................
- // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~....................................................................
- // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~.......
- // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~..................
- // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~.................
- // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~...............
- // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~..............
- // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~......................................................................
- // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~...................................................................
- // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~.......................................
- // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~...............................................................
- // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~.............
- // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~..............................................................
- // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................
- // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~...........................................................
- // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~.....................................................................
- // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~.................................................................
- // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~...
- // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................
- // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................
- // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................
- // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~.........................
- // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............
- // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~...............................................................................
- // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........
- // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................
- // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................
- // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~..........................................................
- // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~.........
- // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~......................................................
- // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~.........................................................
- // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~..................................................
- // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~.....................................................
- // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~.......................................................
- // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................
- // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~...................................................
- // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~....................................................
- // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~.......................
- // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~......................
- // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~....................
- // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~...................
- // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~..........
- // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~...........
- // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~.
- // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................
- // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~.....................
- // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~..............................................
- // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~.................................................
- // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~..........................................
- // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~.............................................
- // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~...............................................
- // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................
- // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~...........................................
- // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................
- // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~...............................
- // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~.............................
- // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~.....
- // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~......
- // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~..
- // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................
- // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~......................................
- // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~..................................
- // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~.................................
- // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................
- // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................
- // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l..............................
- // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l
- // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l..........................
-
- sub count, count, #1
- cbnz count, 1b
- // Instructions: 50
- // Expected cycles: 56
- // Expected IPC: 0.89
- //
- // Cycle bound: 56.0
- // IPC bound: 0.89
- //
- // Wall time: 4.16s
- // User time: 4.16s
- //
- // --------------- original position --------------->
- // 0 25
- // |------------------------|
- smull2 v17.4S, v31.8H, v19.8H // ..*...............................................
- uzp2 v1.8H, v14.8H, v6.8H // ................*.................................
- smull v18.4S, v31.4H, v21.4H // .......*..........................................
- smlal2 v24.4S, v26.8H, v28.8H // *.................................................
- smlal2 v17.4S, v16.8H, v23.8H // ....*.............................................
- smull v21.4S, v31.4H, v19.4H // .....*............................................
- smlal v18.4S, v16.4H, v19.4H // .........*........................................
- uzp2 v31.8H, v4.8H, v3.8H // .*................................................
- uzp1 v3.8H, v14.8H, v6.8H // ............*.....................................
- smlal v21.4S, v16.4H, v23.4H // ..........*.......................................
- smlal v18.4S, v20.4H, v27.4H // ...........*......................................
- uzp2 v14.8H, v29.8H, v25.8H // ...*..............................................
- smlal2 v17.4S, v20.8H, v28.8H // ......*...........................................
- smlal v21.4S, v20.4H, v28.4H // .............*....................................
- smlal v18.4S, v26.4H, v28.4H // ..............*...................................
- smlal2 v24.4S, v9.8H, v1.8H // ..................*...............................
- smlal2 v17.4S, v26.8H, v8.8H // ........*.........................................
- smlal v21.4S, v26.4H, v8.4H // ...............*..................................
- smlal v18.4S, v9.4H, v1.4H // ...................*..............................
- smlal2 v24.4S, v31.8H, v3.8H // ......................*...........................
- smlal2 v17.4S, v9.8H, v3.8H // .................*................................
- smlal v21.4S, v9.4H, v3.4H // ....................*.............................
- smlal v18.4S, v31.4H, v3.4H // .......................*..........................
- smlal2 v24.4S, v30.8H, v14.8H // ..........................*.......................
- smlal2 v17.4S, v31.8H, v12.8H // .....................*............................
- smlal v21.4S, v31.4H, v12.4H // ........................*.........................
- smlal v18.4S, v30.4H, v14.4H // ...........................*......................
- smlal2 v24.4S, v11.8H, v10.8H // ..............................*...................
- smlal2 v17.4S, v30.8H, v10.8H // .........................*........................
- smlal v21.4S, v30.4H, v10.4H // ............................*.....................
- smlal v18.4S, v11.4H, v10.4H // ...............................*..................
- zip2 v19.8H, v7.8H, v15.8H // ......................................*...........
- smlal2 v17.4S, v11.8H, v22.8H // .............................*....................
- smlal v21.4S, v11.4H, v22.4H // ................................*.................
- uzp1 v23.8H, v18.8H, v24.8H // .................................*................
- str q19, [x0, #16] // .........................................*........
- mul v19.8H, v23.8H, v2.8H // ..................................*...............
- uzp1 v23.8H, v21.8H, v17.8H // .....................................*............
- str q5, [x0], #32 // .............................................*....
- mul v26.8H, v23.8H, v2.8H // .......................................*..........
- smlal v18.4S, v19.4H, v0.4H // ...................................*..............
- smlal2 v24.4S, v19.8H, v0.8H // ....................................*.............
- smlal v21.4S, v26.4H, v0.4H // ...........................................*......
- smlal2 v17.4S, v26.8H, v0.8H // ..........................................*.......
- uzp2 v13.8H, v18.8H, v24.8H // ........................................*.........
- uzp2 v19.8H, v21.8H, v17.8H // ............................................*.....
- zip1 v23.8H, v19.8H, v13.8H // ..............................................*...
- zip2 v19.8H, v19.8H, v13.8H // ...............................................*..
- str q23, [x0], #32 // .................................................*
- str q19, [x0, #-16] // ................................................*.
-
- // ----------------- new position ------------------>
- // 0 25
- // |------------------------|------------------------
- // smlal2 v24.4S, v26.8H, v28.8H // ...*..............................................
- // uzp2 v4.8H, v4.8H, v3.8H // .......*..........................................
- // smull2 v13.4S, v31.8H, v19.8H // *.................................................
- // uzp2 v1.8H, v29.8H, v25.8H // ...........*......................................
- // smlal2 v13.4S, v16.8H, v23.8H // ....*.............................................
- // smull v18.4S, v31.4H, v19.4H // .....*............................................
- // smlal2 v13.4S, v20.8H, v28.8H // ............*.....................................
- // smull v29.4S, v31.4H, v21.4H // ..*...............................................
- // smlal2 v13.4S, v26.8H, v8.8H // ................*.................................
- // smlal v29.4S, v16.4H, v19.4H // ......*...........................................
- // smlal v18.4S, v16.4H, v23.4H // .........*........................................
- // smlal v29.4S, v20.4H, v27.4H // ..........*.......................................
- // uzp1 v31.8H, v14.8H, v6.8H // ........*.........................................
- // smlal v18.4S, v20.4H, v28.4H // .............*....................................
- // smlal v29.4S, v26.4H, v28.4H // ..............*...................................
- // smlal v18.4S, v26.4H, v8.4H // .................*................................
- // uzp2 v26.8H, v14.8H, v6.8H // .*................................................
- // smlal2 v13.4S, v9.8H, v31.8H // ....................*.............................
- // smlal2 v24.4S, v9.8H, v26.8H // ...............*..................................
- // smlal v29.4S, v9.4H, v26.4H // ..................*...............................
- // smlal v18.4S, v9.4H, v31.4H // .....................*............................
- // smlal2 v13.4S, v4.8H, v12.8H // ........................*.........................
- // smlal2 v24.4S, v4.8H, v31.8H // ...................*..............................
- // smlal v29.4S, v4.4H, v31.4H // ......................*...........................
- // smlal v18.4S, v4.4H, v12.4H // .........................*........................
- // smlal2 v13.4S, v30.8H, v10.8H // ............................*.....................
- // smlal2 v24.4S, v30.8H, v1.8H // .......................*..........................
- // smlal v29.4S, v30.4H, v1.4H // ..........................*.......................
- // smlal v18.4S, v30.4H, v10.4H // .............................*....................
- // smlal2 v13.4S, v11.8H, v22.8H // ................................*.................
- // smlal2 v24.4S, v11.8H, v10.8H // ...........................*......................
- // smlal v29.4S, v11.4H, v10.4H // ..............................*...................
- // smlal v18.4S, v11.4H, v22.4H // .................................*................
- // uzp1 v31.8H, v29.8H, v24.8H // ..................................*...............
- // mul v19.8H, v31.8H, v2.8H // ....................................*.............
- // smlal v29.4S, v19.4H, v0.4H // ........................................*.........
- // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........
- // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............
- // zip2 v14.8H, v7.8H, v15.8H // ...............................*..................
- // mul v23.8H, v26.8H, v2.8H // .......................................*..........
- // uzp2 v15.8H, v29.8H, v24.8H // ............................................*.....
- // str q14, [x0, #16] // ...................................*..............
- // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*......
- // smlal v18.4S, v23.4H, v0.4H // ..........................................*.......
- // uzp2 v7.8H, v18.8H, v13.8H // .............................................*....
- // str q5, [x0], #32 // ......................................*...........
- // zip1 v5.8H, v7.8H, v15.8H // ..............................................*...
- // zip2 v14.8H, v7.8H, v15.8H // ...............................................*..
- // str q14, [x0, #16] // .................................................*
- // str q5, [x0], #32 // ................................................*.
-
-
- pop_stack
- ret
-#endif /* MLKEM_K == 4 */
-
-/****************** REGISTER DEALLOCATIONS *******************/
- .unreq out
- .unreq a0_ptr
- .unreq b0_ptr
- .unreq b0_cache_ptr
- .unreq a1_ptr
- .unreq b1_ptr
- .unreq b1_cache_ptr
- .unreq a2_ptr
- .unreq b2_ptr
- .unreq b2_cache_ptr
- .unreq a3_ptr
- .unreq b3_ptr
- .unreq b3_cache_ptr
- .unreq count
- .unreq modulus
- .unreq modulus_twisted
- .unreq wtmp
- .unreq aa0
- .unreq aa1
- .unreq bb0
- .unreq bb1
- .unreq bb1t
- .unreq res0l
- .unreq res1l
- .unreq res0h
- .unreq res1h
- .unreq tmp0
- .unreq tmp1
- .unreq q_tmp0
- .unreq q_tmp1
- .unreq out0
- .unreq out1
- .unreq t0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
index 8302d2a3e..f2451815a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
@@ -19,8 +19,8 @@
* Returns number of sampled 16-bit integers (at most MLKEM_N).
**************************************************/
#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
// We save the output on the stack first, and copy to the actual
// output buffer only in the end. This is because the main loop can overwrite
@@ -112,9 +112,9 @@
mlkem_q .req v30
bits .req v31
-.text
-.global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean)
-.balign 4
+ .text
+ .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean)
+ .balign 4
MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean):
push_stack
@@ -402,5 +402,5 @@ return:
.unreq mlkem_q
.unreq bits
-#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) ||
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */
+/* simpasm: footer-start */
+#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c
index becdf303b..592c15fb0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c
@@ -10,8 +10,7 @@
#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
#include
#include "arith_native_aarch64.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h
index f9fe4310a..df43dc5b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h
@@ -11,21 +11,10 @@
#include "../sys.h"
#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
#include "aarch64/opt.h"
#endif /* SYS_AARCH64 */
#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
#include "x86_64/default.h"
#endif /* SYS_X86_64 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h
index f9fe4310a..df43dc5b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h
@@ -11,21 +11,10 @@
#include "../sys.h"
#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
#include "aarch64/opt.h"
#endif /* SYS_AARCH64 */
#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
#include "x86_64/default.h"
#endif /* SYS_X86_64 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h
index f9fe4310a..df43dc5b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h
@@ -11,21 +11,10 @@
#include "../sys.h"
#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
#include "aarch64/opt.h"
#endif /* SYS_AARCH64 */
#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
#include "x86_64/default.h"
#endif /* SYS_X86_64 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S
index 5fdc3d0a0..3063d20ae 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S
@@ -8,6 +8,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
#include "consts.h"
@@ -113,6 +114,7 @@ vmovdqa %ymm11,(64*\off+48)*2(%rdi)
.text
.global MLKEM_ASM_NAMESPACE(basemul_avx2)
+.balign 4
MLKEM_ASM_NAMESPACE(basemul_avx2):
mov %rsp,%r8
and $-32,%rsp
@@ -133,4 +135,5 @@ schoolbook 3
mov %r8,%rsp
ret
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S
index 7b1f22624..e74199930 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S
@@ -12,6 +12,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
#include "consts.h"
#include "shuffle.inc"
@@ -242,6 +243,7 @@ vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.text
.global MLKEM_ASM_NAMESPACE(invntt_avx2)
+.balign 4
MLKEM_ASM_NAMESPACE(invntt_avx2):
vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
@@ -252,4 +254,5 @@ intt_level6 0
intt_level6 1
ret
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S
index 5d928b4cc..70582fbc1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S
@@ -8,6 +8,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
#include "consts.h"
#include "shuffle.inc"
@@ -205,6 +206,7 @@ vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.text
.global MLKEM_ASM_NAMESPACE(ntt_avx2)
+.balign 4
MLKEM_ASM_NAMESPACE(ntt_avx2):
vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
@@ -216,4 +218,5 @@ levels1t6 1
ret
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttfrombytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttfrombytes.S
new file mode 100644
index 000000000..71f2af000
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttfrombytes.S
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(nttfrombytes_avx2):
+#consts
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0
+call nttfrombytes128_avx
+add $256,%rdi
+add $192,%rsi
+call nttfrombytes128_avx
+ret
+
+nttfrombytes128_avx:
+#load
+vmovdqu (%rsi),%ymm4
+vmovdqu 32(%rsi),%ymm5
+vmovdqu 64(%rsi),%ymm6
+vmovdqu 96(%rsi),%ymm7
+vmovdqu 128(%rsi),%ymm8
+vmovdqu 160(%rsi),%ymm9
+
+shuffle8 4,7,3,7
+shuffle8 5,8,4,8
+shuffle8 6,9,5,9
+
+shuffle4 3,8,6,8
+shuffle4 7,5,3,5
+shuffle4 4,9,7,9
+
+shuffle2 6,5,4,5
+shuffle2 8,7,6,7
+shuffle2 3,9,8,9
+
+shuffle1 4,7,10,7
+shuffle1 5,8,4,8
+shuffle1 6,9,5,9
+
+#bitunpack
+vpsrlw $12,%ymm10,%ymm11
+vpsllw $4,%ymm7,%ymm12
+vpor %ymm11,%ymm12,%ymm11
+vpand %ymm0,%ymm10,%ymm10
+vpand %ymm0,%ymm11,%ymm11
+
+vpsrlw $8,%ymm7,%ymm12
+vpsllw $8,%ymm4,%ymm13
+vpor %ymm12,%ymm13,%ymm12
+vpand %ymm0,%ymm12,%ymm12
+
+vpsrlw $4,%ymm4,%ymm13
+vpand %ymm0,%ymm13,%ymm13
+
+vpsrlw $12,%ymm8,%ymm14
+vpsllw $4,%ymm5,%ymm15
+vpor %ymm14,%ymm15,%ymm14
+vpand %ymm0,%ymm8,%ymm8
+vpand %ymm0,%ymm14,%ymm14
+
+vpsrlw $8,%ymm5,%ymm15
+vpsllw $8,%ymm9,%ymm1
+vpor %ymm15,%ymm1,%ymm15
+vpand %ymm0,%ymm15,%ymm15
+
+vpsrlw $4,%ymm9,%ymm1
+vpand %ymm0,%ymm1,%ymm1
+
+#store
+vmovdqa %ymm10,(%rdi)
+vmovdqa %ymm11,32(%rdi)
+vmovdqa %ymm12,64(%rdi)
+vmovdqa %ymm13,96(%rdi)
+vmovdqa %ymm8,128(%rdi)
+vmovdqa %ymm14,160(%rdi)
+vmovdqa %ymm15,192(%rdi)
+vmovdqa %ymm1,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttpack.S
new file mode 100644
index 000000000..4c10ef366
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttpack.S
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+/* nttpack_avx2: reorder, in place, the 256 bytes (eight 32-byte vectors) at (%rdi) using the shuffle1/2/4/8 macros from shuffle.inc. */
+.text
+.global MLKEM_ASM_NAMESPACE(nttpack_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(nttpack_avx2):
+#load
+vmovdqa (%rdi),%ymm4 /* vmovdqa: (%rdi) must be 32-byte aligned */
+vmovdqa 32(%rdi),%ymm5
+vmovdqa 64(%rdi),%ymm6
+vmovdqa 96(%rdi),%ymm7
+vmovdqa 128(%rdi),%ymm8
+vmovdqa 160(%rdi),%ymm9
+vmovdqa 192(%rdi),%ymm10
+vmovdqa 224(%rdi),%ymm11
+/* NOTE(review): only one 256-byte block is processed here, whereas nttunpack_avx2 (nttunpack.S) runs its helper twice with %rdi advanced by 256 -- confirm callers expect this. */
+shuffle1 4,5,3,5
+shuffle1 6,7,4,7
+shuffle1 8,9,6,9
+shuffle1 10,11,8,11
+
+shuffle2 3,4,10,4
+shuffle2 6,8,3,8
+shuffle2 5,7,6,7
+shuffle2 9,11,5,11
+
+shuffle4 10,3,9,3
+shuffle4 6,5,10,5
+shuffle4 4,8,6,8
+shuffle4 7,11,4,11
+
+shuffle8 9,10,7,10
+shuffle8 6,4,9,4
+shuffle8 3,5,6,5
+shuffle8 8,11,3,11
+
+#store
+vmovdqa %ymm7,(%rdi) /* results overwrite the input block */
+vmovdqa %ymm9,32(%rdi)
+vmovdqa %ymm6,64(%rdi)
+vmovdqa %ymm3,96(%rdi)
+vmovdqa %ymm10,128(%rdi)
+vmovdqa %ymm4,160(%rdi)
+vmovdqa %ymm5,192(%rdi)
+vmovdqa %ymm11,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntttobytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntttobytes.S
new file mode 100644
index 000000000..4f0b01e83
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntttobytes.S
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+/* ntttobytes_avx2: pack 2 x 256 bytes of 16-bit lanes read from (%rsi) into 2 x 192 bytes written to (%rdi); %rdx is the base of the constants table. */
+.text
+.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(ntttobytes_avx2):
+#consts
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0 /* NOTE(review): ymm0 is not referenced by any instruction visible below -- presumably a leftover; confirm shuffle.inc does not use it */
+call ntttobytes128_avx /* first block */
+add $256,%rsi /* skip the 8 x 32 input bytes just consumed */
+add $192,%rdi /* skip the 6 x 32 output bytes just stored */
+call ntttobytes128_avx /* second block */
+ret
+/* Helper: converts one 256-byte block into one 192-byte block. Uses ymm3-ymm12 as working registers. */
+ntttobytes128_avx:
+#load
+vmovdqa (%rsi),%ymm5 /* vmovdqa: the input pointer must be 32-byte aligned */
+vmovdqa 32(%rsi),%ymm6
+vmovdqa 64(%rsi),%ymm7
+vmovdqa 96(%rsi),%ymm8
+vmovdqa 128(%rsi),%ymm9
+vmovdqa 160(%rsi),%ymm10
+vmovdqa 192(%rsi),%ymm11
+vmovdqa 224(%rsi),%ymm12
+/* Pack four 16-bit lanes (c0..c3) into three (p0..p2); no masking is done, so this presumably relies on every input lane fitting in 12 bits -- confirm with caller. */
+#bitpack
+vpsllw $12,%ymm6,%ymm4
+vpor %ymm4,%ymm5,%ymm4 /* p0 = c0 | (c1 << 12) */
+
+vpsrlw $4,%ymm6,%ymm5
+vpsllw $8,%ymm7,%ymm6
+vpor %ymm5,%ymm6,%ymm5 /* p1 = (c1 >> 4) | (c2 << 8) */
+
+vpsrlw $8,%ymm7,%ymm6
+vpsllw $4,%ymm8,%ymm7
+vpor %ymm6,%ymm7,%ymm6 /* p2 = (c2 >> 8) | (c3 << 4) */
+/* Same three-output pattern for the second group of four input vectors (ymm9..ymm12); outputs land in ymm7, ymm8, ymm9. */
+vpsllw $12,%ymm10,%ymm7
+vpor %ymm7,%ymm9,%ymm7
+
+vpsrlw $4,%ymm10,%ymm8
+vpsllw $8,%ymm11,%ymm9
+vpor %ymm8,%ymm9,%ymm8
+
+vpsrlw $8,%ymm11,%ymm9
+vpsllw $4,%ymm12,%ymm10
+vpor %ymm9,%ymm10,%ymm9
+/* Reorder the six packed vectors (ymm4..ymm9) with the shuffle1/2/4/8 macros from shuffle.inc (macro definitions are not part of this file). */
+shuffle1 4,5,3,5
+shuffle1 6,7,4,7
+shuffle1 8,9,6,9
+
+shuffle2 3,4,8,4
+shuffle2 6,5,3,5
+shuffle2 7,9,6,9
+
+shuffle4 8,3,7,3
+shuffle4 6,4,8,4
+shuffle4 5,9,6,9
+
+shuffle8 7,8,5,8
+shuffle8 6,3,7,3
+shuffle8 4,9,6,9
+
+#store
+vmovdqu %ymm5,(%rdi) /* vmovdqu: the output pointer need not be 32-byte aligned */
+vmovdqu %ymm7,32(%rdi)
+vmovdqu %ymm6,64(%rdi)
+vmovdqu %ymm8,96(%rdi)
+vmovdqu %ymm3,128(%rdi)
+vmovdqu %ymm9,160(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttunpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttunpack.S
new file mode 100644
index 000000000..0cf45c671
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttunpack.S
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+/* nttunpack_avx2: reorder, in place, the 512 bytes at (%rdi) as two independent 256-byte blocks using the shuffle8/4/2/1 macros from shuffle.inc. */
+.text
+.global MLKEM_ASM_NAMESPACE(nttunpack_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(nttunpack_avx2):
+call nttunpack128_avx2 /* first 256-byte block */
+add $256,%rdi /* step to the second block */
+call nttunpack128_avx2 /* second 256-byte block */
+ret
+/* Helper: reorders one 256-byte block (eight 32-byte vectors) at (%rdi) in place. Uses ymm3-ymm11 as working registers. */
+nttunpack128_avx2:
+#load
+vmovdqa (%rdi),%ymm4 /* vmovdqa: (%rdi) must be 32-byte aligned */
+vmovdqa 32(%rdi),%ymm5
+vmovdqa 64(%rdi),%ymm6
+vmovdqa 96(%rdi),%ymm7
+vmovdqa 128(%rdi),%ymm8
+vmovdqa 160(%rdi),%ymm9
+vmovdqa 192(%rdi),%ymm10
+vmovdqa 224(%rdi),%ymm11
+/* Shuffle stages run in the order 8, 4, 2, 1 -- the reverse of the 1, 2, 4, 8 stage order used in nttpack.S. */
+shuffle8 4,8,3,8
+shuffle8 5,9,4,9
+shuffle8 6,10,5,10
+shuffle8 7,11,6,11
+
+shuffle4 3,5,7,5
+shuffle4 8,10,3,10
+shuffle4 4,6,8,6
+shuffle4 9,11,4,11
+
+shuffle2 7,8,9,8
+shuffle2 5,6,7,6
+shuffle2 3,4,5,4
+shuffle2 10,11,3,11
+
+shuffle1 9,5,10,5
+shuffle1 8,4,9,4
+shuffle1 7,3,8,3
+shuffle1 6,11,7,11
+
+#store
+vmovdqa %ymm10,(%rdi) /* results overwrite the input block */
+vmovdqa %ymm5,32(%rdi)
+vmovdqa %ymm9,64(%rdi)
+vmovdqa %ymm4,96(%rdi)
+vmovdqa %ymm8,128(%rdi)
+vmovdqa %ymm3,160(%rdi)
+vmovdqa %ymm7,192(%rdi)
+vmovdqa %ymm11,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/reduce.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/reduce.S
new file mode 100644
index 000000000..78bad0559
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/reduce.S
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation based on Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+// Changes:
+// - Add call to csubq in reduce128_avx2 to produce outputs
+// in [0,1,...,q-1] rather than [0,1,...,q], matching the
+// semantics of poly_reduce().
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+/* reduce_avx2: reduce, in place, the 512 bytes of 16-bit lanes at (%rdi) as two 256-byte blocks; %rsi is the base of the constants table. */
+.text
+.global MLKEM_ASM_NAMESPACE(reduce_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(reduce_avx2):
+#consts
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 /* ymm0 = 16XQ entry of the constants table */
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1 /* ymm1 = 16XV entry of the constants table */
+call reduce128_avx2 /* first 256-byte block */
+add $256,%rdi /* step to the second block */
+call reduce128_avx2 /* second 256-byte block */
+ret
+/* Helper: reduces one 256-byte block (eight 32-byte vectors) at (%rdi) in place, working in ymm2-ymm9. */
+reduce128_avx2:
+#load
+vmovdqa (%rdi),%ymm2 /* vmovdqa: (%rdi) must be 32-byte aligned */
+vmovdqa 32(%rdi),%ymm3
+vmovdqa 64(%rdi),%ymm4
+vmovdqa 96(%rdi),%ymm5
+vmovdqa 128(%rdi),%ymm6
+vmovdqa 160(%rdi),%ymm7
+vmovdqa 192(%rdi),%ymm8
+vmovdqa 224(%rdi),%ymm9
+/* red16 and csubq are macros from fq.inc (not part of this file); presumably they read the constants loaded into ymm0/ymm1 by the caller -- confirm. */
+red16 2
+red16 3
+red16 4
+red16 5
+red16 6
+red16 7
+red16 8
+red16 9
+/* Extra csubq pass: per the header comment of this file it narrows the output range from [0, q] to [0, q-1]. */
+csubq 2
+csubq 3
+csubq 4
+csubq 5
+csubq 6
+csubq 7
+csubq 8
+csubq 9
+
+#store
+vmovdqa %ymm2,(%rdi) /* results overwrite the input block */
+vmovdqa %ymm3,32(%rdi)
+vmovdqa %ymm4,64(%rdi)
+vmovdqa %ymm5,96(%rdi)
+vmovdqa %ymm6,128(%rdi)
+vmovdqa %ymm7,160(%rdi)
+vmovdqa %ymm8,192(%rdi)
+vmovdqa %ymm9,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S
deleted file mode 100644
index 9bcd04896..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-// Implementation from Kyber reference repository
-// https://github.com/pq-crystals/kyber/blob/main/avx2
-
-#include "../../../common.h"
-
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
-
-#include "consts.h"
-#include "fq.inc"
-#include "shuffle.inc"
-
-.global MLKEM_ASM_NAMESPACE(nttpack_avx2)
-MLKEM_ASM_NAMESPACE(nttpack_avx2):
-#load
-vmovdqa (%rdi),%ymm4
-vmovdqa 32(%rdi),%ymm5
-vmovdqa 64(%rdi),%ymm6
-vmovdqa 96(%rdi),%ymm7
-vmovdqa 128(%rdi),%ymm8
-vmovdqa 160(%rdi),%ymm9
-vmovdqa 192(%rdi),%ymm10
-vmovdqa 224(%rdi),%ymm11
-
-shuffle1 4,5,3,5
-shuffle1 6,7,4,7
-shuffle1 8,9,6,9
-shuffle1 10,11,8,11
-
-shuffle2 3,4,10,4
-shuffle2 6,8,3,8
-shuffle2 5,7,6,7
-shuffle2 9,11,5,11
-
-shuffle4 10,3,9,3
-shuffle4 6,5,10,5
-shuffle4 4,8,6,8
-shuffle4 7,11,4,11
-
-shuffle8 9,10,7,10
-shuffle8 6,4,9,4
-shuffle8 3,5,6,5
-shuffle8 8,11,3,11
-
-#store
-vmovdqa %ymm7,(%rdi)
-vmovdqa %ymm9,32(%rdi)
-vmovdqa %ymm6,64(%rdi)
-vmovdqa %ymm3,96(%rdi)
-vmovdqa %ymm10,128(%rdi)
-vmovdqa %ymm4,160(%rdi)
-vmovdqa %ymm5,192(%rdi)
-vmovdqa %ymm11,224(%rdi)
-
-ret
-
-nttunpack128_avx2:
-#load
-vmovdqa (%rdi),%ymm4
-vmovdqa 32(%rdi),%ymm5
-vmovdqa 64(%rdi),%ymm6
-vmovdqa 96(%rdi),%ymm7
-vmovdqa 128(%rdi),%ymm8
-vmovdqa 160(%rdi),%ymm9
-vmovdqa 192(%rdi),%ymm10
-vmovdqa 224(%rdi),%ymm11
-
-shuffle8 4,8,3,8
-shuffle8 5,9,4,9
-shuffle8 6,10,5,10
-shuffle8 7,11,6,11
-
-shuffle4 3,5,7,5
-shuffle4 8,10,3,10
-shuffle4 4,6,8,6
-shuffle4 9,11,4,11
-
-shuffle2 7,8,9,8
-shuffle2 5,6,7,6
-shuffle2 3,4,5,4
-shuffle2 10,11,3,11
-
-shuffle1 9,5,10,5
-shuffle1 8,4,9,4
-shuffle1 7,3,8,3
-shuffle1 6,11,7,11
-
-#store
-vmovdqa %ymm10,(%rdi)
-vmovdqa %ymm5,32(%rdi)
-vmovdqa %ymm9,64(%rdi)
-vmovdqa %ymm4,96(%rdi)
-vmovdqa %ymm8,128(%rdi)
-vmovdqa %ymm3,160(%rdi)
-vmovdqa %ymm7,192(%rdi)
-vmovdqa %ymm11,224(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(nttunpack_avx2)
-MLKEM_ASM_NAMESPACE(nttunpack_avx2):
-call nttunpack128_avx2
-add $256,%rdi
-call nttunpack128_avx2
-ret
-
-ntttobytes128_avx:
-#load
-vmovdqa (%rsi),%ymm5
-vmovdqa 32(%rsi),%ymm6
-vmovdqa 64(%rsi),%ymm7
-vmovdqa 96(%rsi),%ymm8
-vmovdqa 128(%rsi),%ymm9
-vmovdqa 160(%rsi),%ymm10
-vmovdqa 192(%rsi),%ymm11
-vmovdqa 224(%rsi),%ymm12
-
-#bitpack
-vpsllw $12,%ymm6,%ymm4
-vpor %ymm4,%ymm5,%ymm4
-
-vpsrlw $4,%ymm6,%ymm5
-vpsllw $8,%ymm7,%ymm6
-vpor %ymm5,%ymm6,%ymm5
-
-vpsrlw $8,%ymm7,%ymm6
-vpsllw $4,%ymm8,%ymm7
-vpor %ymm6,%ymm7,%ymm6
-
-vpsllw $12,%ymm10,%ymm7
-vpor %ymm7,%ymm9,%ymm7
-
-vpsrlw $4,%ymm10,%ymm8
-vpsllw $8,%ymm11,%ymm9
-vpor %ymm8,%ymm9,%ymm8
-
-vpsrlw $8,%ymm11,%ymm9
-vpsllw $4,%ymm12,%ymm10
-vpor %ymm9,%ymm10,%ymm9
-
-shuffle1 4,5,3,5
-shuffle1 6,7,4,7
-shuffle1 8,9,6,9
-
-shuffle2 3,4,8,4
-shuffle2 6,5,3,5
-shuffle2 7,9,6,9
-
-shuffle4 8,3,7,3
-shuffle4 6,4,8,4
-shuffle4 5,9,6,9
-
-shuffle8 7,8,5,8
-shuffle8 6,3,7,3
-shuffle8 4,9,6,9
-
-#store
-vmovdqu %ymm5,(%rdi)
-vmovdqu %ymm7,32(%rdi)
-vmovdqu %ymm6,64(%rdi)
-vmovdqu %ymm8,96(%rdi)
-vmovdqu %ymm3,128(%rdi)
-vmovdqu %ymm9,160(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2)
-MLKEM_ASM_NAMESPACE(ntttobytes_avx2):
-#consts
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0
-call ntttobytes128_avx
-add $256,%rsi
-add $192,%rdi
-call ntttobytes128_avx
-ret
-
-nttfrombytes128_avx:
-#load
-vmovdqu (%rsi),%ymm4
-vmovdqu 32(%rsi),%ymm5
-vmovdqu 64(%rsi),%ymm6
-vmovdqu 96(%rsi),%ymm7
-vmovdqu 128(%rsi),%ymm8
-vmovdqu 160(%rsi),%ymm9
-
-shuffle8 4,7,3,7
-shuffle8 5,8,4,8
-shuffle8 6,9,5,9
-
-shuffle4 3,8,6,8
-shuffle4 7,5,3,5
-shuffle4 4,9,7,9
-
-shuffle2 6,5,4,5
-shuffle2 8,7,6,7
-shuffle2 3,9,8,9
-
-shuffle1 4,7,10,7
-shuffle1 5,8,4,8
-shuffle1 6,9,5,9
-
-#bitunpack
-vpsrlw $12,%ymm10,%ymm11
-vpsllw $4,%ymm7,%ymm12
-vpor %ymm11,%ymm12,%ymm11
-vpand %ymm0,%ymm10,%ymm10
-vpand %ymm0,%ymm11,%ymm11
-
-vpsrlw $8,%ymm7,%ymm12
-vpsllw $8,%ymm4,%ymm13
-vpor %ymm12,%ymm13,%ymm12
-vpand %ymm0,%ymm12,%ymm12
-
-vpsrlw $4,%ymm4,%ymm13
-vpand %ymm0,%ymm13,%ymm13
-
-vpsrlw $12,%ymm8,%ymm14
-vpsllw $4,%ymm5,%ymm15
-vpor %ymm14,%ymm15,%ymm14
-vpand %ymm0,%ymm8,%ymm8
-vpand %ymm0,%ymm14,%ymm14
-
-vpsrlw $8,%ymm5,%ymm15
-vpsllw $8,%ymm9,%ymm1
-vpor %ymm15,%ymm1,%ymm15
-vpand %ymm0,%ymm15,%ymm15
-
-vpsrlw $4,%ymm9,%ymm1
-vpand %ymm0,%ymm1,%ymm1
-
-#store
-vmovdqa %ymm10,(%rdi)
-vmovdqa %ymm11,32(%rdi)
-vmovdqa %ymm12,64(%rdi)
-vmovdqa %ymm13,96(%rdi)
-vmovdqa %ymm8,128(%rdi)
-vmovdqa %ymm14,160(%rdi)
-vmovdqa %ymm15,192(%rdi)
-vmovdqa %ymm1,224(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2)
-MLKEM_ASM_NAMESPACE(nttfrombytes_avx2):
-#consts
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0
-call nttfrombytes128_avx
-add $256,%rdi
-add $192,%rsi
-call nttfrombytes128_avx
-ret
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/tomont.S
similarity index 64%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/tomont.S
index 3f013a5fa..7774cec0b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/tomont.S
@@ -14,63 +14,24 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
-#include "consts.h"
+/* simpasm: header-end */
+#include "consts.h"
#include "fq.inc"
.text
-reduce128_avx2:
-#load
-vmovdqa (%rdi),%ymm2
-vmovdqa 32(%rdi),%ymm3
-vmovdqa 64(%rdi),%ymm4
-vmovdqa 96(%rdi),%ymm5
-vmovdqa 128(%rdi),%ymm6
-vmovdqa 160(%rdi),%ymm7
-vmovdqa 192(%rdi),%ymm8
-vmovdqa 224(%rdi),%ymm9
-
-red16 2
-red16 3
-red16 4
-red16 5
-red16 6
-red16 7
-red16 8
-red16 9
-
-csubq 2
-csubq 3
-csubq 4
-csubq 5
-csubq 6
-csubq 7
-csubq 8
-csubq 9
-
-#store
-vmovdqa %ymm2,(%rdi)
-vmovdqa %ymm3,32(%rdi)
-vmovdqa %ymm4,64(%rdi)
-vmovdqa %ymm5,96(%rdi)
-vmovdqa %ymm6,128(%rdi)
-vmovdqa %ymm7,160(%rdi)
-vmovdqa %ymm8,192(%rdi)
-vmovdqa %ymm9,224(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(reduce_avx2)
-MLKEM_ASM_NAMESPACE(reduce_avx2):
+.global MLKEM_ASM_NAMESPACE(tomont_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(tomont_avx2):
#consts
vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1
-call reduce128_avx2
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2
+call tomont128_avx2
add $256,%rdi
-call reduce128_avx2
+call tomont128_avx2
ret
-
tomont128_avx2:
#load
vmovdqa (%rdi),%ymm3
@@ -103,15 +64,5 @@ vmovdqa %ymm10,224(%rdi)
ret
-.global MLKEM_ASM_NAMESPACE(tomont_avx2)
-MLKEM_ASM_NAMESPACE(tomont_avx2):
-#consts
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2
-call tomont128_avx2
-add $256,%rdi
-call tomont128_avx2
-ret
-
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md
index e499a4a22..a420f05b6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md
@@ -10,10 +10,9 @@ works:
- _Fast and Clean: Auditable high-performance assembly via constraint solving_, Amin Abdulrahman, Hanno Becker, Matthias
J. Kannwischer, Fabien Klein, [https://eprint.iacr.org/2022/1303](https://eprint.iacr.org/2022/1303)
-## Profiles
-This backend comes with two profiles: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to
-read and modify; for example, is heavily leverages register aliases and assembly macros. The optimized profile is
-automatically generated from the clean profile via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the
+## Variants
+
+This backend comes in two versions: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to read and modify; for example, it heavily leverages register aliases and assembly macros. This directory contains the optimized version, which is automatically generated from the clean one via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the
target architecture is Cortex-A55, but you can easily re-optimize the code for a different microarchitecture supported
-by SLOTHY, by adjusting the parameters in [optimize.sh](src/optimize.sh).
+by SLOTHY, by adjusting the parameters in [optimize.sh](../../../test/aarch64_clean/src/optimize.sh).
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h
deleted file mode 100644
index f124702a4..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/* ML-KEM arithmetic native profile for clean assembly */
-
-#ifdef MLKEM_NATIVE_ARITH_PROFILE_H
-#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles?
-#else
-#define MLKEM_NATIVE_ARITH_PROFILE_H
-
-/* Identifier for this backend so that source and assembly files
- * in the build can be appropriately guarded. */
-#define MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN
-
-#define MLKEM_NATIVE_ARITH_BACKEND_NAME AARCH64_CLEAN
-
-/* Filename of the C backend implementation.
- * This is not inlined here because this header is included in assembly
- * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h"
-
-#endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h
index a7217163f..4a0243279 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h
@@ -3,8 +3,6 @@
* SPDX-License-Identifier: Apache-2.0
*/
-/* ML-KEM arithmetic native profile for clean assembly */
-
#ifdef MLKEM_NATIVE_ARITH_PROFILE_H
#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles?
#else
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c
index 2c1bb31e1..23e7949d3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c
@@ -10,8 +10,7 @@
#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
#include
#include "arith_native_aarch64.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h
index ed0825892..60779598d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h
@@ -29,62 +29,49 @@ extern const int16_t aarch64_zetas_mulcache_native[];
extern const int16_t aarch64_zetas_mulcache_twisted_native[];
extern const uint8_t rej_uniform_table[];
-#define ntt_asm_clean MLKEM_NAMESPACE(ntt_asm_clean)
-void ntt_asm_clean(int16_t *, const int16_t *, const int16_t *);
-
#define ntt_asm_opt MLKEM_NAMESPACE(ntt_asm_opt)
void ntt_asm_opt(int16_t *, const int16_t *, const int16_t *);
-#define intt_asm_clean MLKEM_NAMESPACE(intt_asm_clean)
-void intt_asm_clean(int16_t *, const int16_t *, const int16_t *);
-
#define intt_asm_opt MLKEM_NAMESPACE(intt_asm_opt)
void intt_asm_opt(int16_t *, const int16_t *, const int16_t *);
-#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean)
-unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen,
- const uint8_t *table);
-
-#define poly_reduce_asm_clean MLKEM_NAMESPACE(poly_reduce_asm_clean)
-void poly_reduce_asm_clean(int16_t *);
-
#define poly_reduce_asm_opt MLKEM_NAMESPACE(poly_reduce_asm_opt)
void poly_reduce_asm_opt(int16_t *);
-#define poly_tomont_asm_clean MLKEM_NAMESPACE(poly_tomont_asm_clean)
-void poly_tomont_asm_clean(int16_t *);
-
#define poly_tomont_asm_opt MLKEM_NAMESPACE(poly_tomont_asm_opt)
void poly_tomont_asm_opt(int16_t *);
-#define poly_mulcache_compute_asm_clean \
- MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean)
-void poly_mulcache_compute_asm_clean(int16_t *, const int16_t *,
- const int16_t *, const int16_t *);
-
-
#define poly_mulcache_compute_asm_opt \
MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt)
void poly_mulcache_compute_asm_opt(int16_t *, const int16_t *, const int16_t *,
const int16_t *);
-#define poly_tobytes_asm_clean MLKEM_NAMESPACE(poly_tobytes_asm_clean)
-void poly_tobytes_asm_clean(uint8_t *r, const int16_t *a);
-
#define poly_tobytes_asm_opt MLKEM_NAMESPACE(poly_tobytes_asm_opt)
void poly_tobytes_asm_opt(uint8_t *r, const int16_t *a);
-#define polyvec_basemul_acc_montgomery_cached_asm_clean \
- MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-void polyvec_basemul_acc_montgomery_cached_asm_clean(int16_t *r,
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache);
+#define polyvec_basemul_acc_montgomery_cached_asm_k2_opt \
+ MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt)
+void polyvec_basemul_acc_montgomery_cached_asm_k2_opt(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache);
+
+#define polyvec_basemul_acc_montgomery_cached_asm_k3_opt \
+ MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt)
+void polyvec_basemul_acc_montgomery_cached_asm_k3_opt(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache);
+
+#define polyvec_basemul_acc_montgomery_cached_asm_k4_opt \
+ MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt)
+void polyvec_basemul_acc_montgomery_cached_asm_k4_opt(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache);
-#define polyvec_basemul_acc_montgomery_cached_asm_opt \
- MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-void polyvec_basemul_acc_montgomery_cached_asm_opt(int16_t *r, const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache);
+#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean)
+unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen,
+ const uint8_t *table);
#endif /* MLKEM_AARCH64_NATIVE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h
deleted file mode 100644
index 4be90fb24..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/* ML-KEM arithmetic native profile for clean assembly */
-
-#ifdef MLKEM_NATIVE_ARITH_PROFILE_IMPL_H
-#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles?
-#else
-#define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H
-
-#include "arith_native_aarch64.h"
-
-/* Set of primitives that this backend replaces */
-#define MLKEM_USE_NATIVE_NTT
-#define MLKEM_USE_NATIVE_INTT
-#define MLKEM_USE_NATIVE_POLY_REDUCE
-#define MLKEM_USE_NATIVE_POLY_TOMONT
-#define MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE
-#define MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
-#define MLKEM_USE_NATIVE_POLY_TOBYTES
-#define MLKEM_USE_NATIVE_REJ_UNIFORM
-
-static INLINE void ntt_native(int16_t data[MLKEM_N])
-{
- ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56);
-}
-
-static INLINE void intt_native(int16_t data[MLKEM_N])
-{
- intt_asm_clean(data, aarch64_invntt_zetas_layer01234,
- aarch64_invntt_zetas_layer56);
-}
-
-static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
-{
- poly_reduce_asm_clean(data);
-}
-
-static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
-{
- poly_tomont_asm_clean(data);
-}
-
-static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
- const int16_t y[MLKEM_N])
-{
- poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native,
- aarch64_zetas_mulcache_twisted_native);
-}
-
-static INLINE void polyvec_basemul_acc_montgomery_cached_native(
- int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
- const int16_t b[MLKEM_K * MLKEM_N],
- const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
-{
- polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache);
-}
-
-static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
-{
- poly_tobytes_asm_clean(r, a);
-}
-
-static INLINE int rej_uniform_native(int16_t *r, unsigned len,
- const uint8_t *buf, unsigned buflen)
-{
- if (len != MLKEM_N || buflen % 24 != 0)
- {
- return -1;
- }
- return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table);
-}
-
-#endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S
deleted file mode 100644
index b0ae1ad46..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S
+++ /dev/null
@@ -1,389 +0,0 @@
-/// Copyright (c) 2024 The mlkem-native project authors
-/// Copyright (c) 2022 Arm Limited
-/// Copyright (c) 2022 Hanno Becker
-/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
-/// SPDX-License-Identifier: MIT
-///
-/// Permission is hereby granted, free of charge, to any person obtaining a copy
-/// of this software and associated documentation files (the "Software"), to deal
-/// in the Software without restriction, including without limitation the rights
-/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-/// copies of the Software, and to permit persons to whom the Software is
-/// furnished to do so, subject to the following conditions:
-///
-/// The above copyright notice and this permission notice shall be included in all
-/// copies or substantial portions of the Software.
-///
-/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-/// SOFTWARE.
-///
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
-
-// Bounds:
-// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
-//
-// See mlken/reduce.c and test/test_bounds.py for more details.
-.macro mulmodq dst, src, const, idx0, idx1
- // Signed barrett multiplication using
- // round-to-nearest-even-integer approximation.
- // Following https://eprint.iacr.org/2021/986.pdf, this
- // is functionally the same as a signed Montgomery multiplication
- // with a suitable constant of absolute value < q.
- sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()]
- mul \dst\().8h, \src\().8h, \const\().h[\idx0\()]
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro mulmod dst, src, const, const_twisted
- sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h
- mul \dst\().8h, \src\().8h, \const\().8h
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro gs_butterfly a, b, root, idx0, idx1
- sub tmp.8h, \a\().8h, \b\().8h
- add \a\().8h, \a\().8h, \b\().8h
- mulmodq \b, tmp, \root, \idx0, \idx1
-.endm
-
-.macro gs_butterfly_v a, b, root, root_twisted
- sub tmp.8h, \a\().8h, \b\().8h
- add \a\().8h, \a\().8h, \b\().8h
- mulmod \b, tmp, \root, \root_twisted
-.endm
-
-.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
- mulmod \dst0, \src0, ninv, ninv_tw
- mulmod \dst1, \src1, ninv, ninv_tw
- mulmod \dst2, \src2, ninv, ninv_tw
- mulmod \dst3, \src3, ninv, ninv_tw
-.endm
-
-.macro barrett_reduce a
- sqdmulh t0.8h, \a\().8h, consts.h[1]
- srshr t0.8h, t0.8h, #11
- mls \a\().8h, t0.8h, consts.h[0]
-.endm
-
-.macro load_roots_012
- ldr q_root0, [r01234_ptr], #32
- ldr q_root1, [r01234_ptr, #-16]
-.endm
-
-.macro load_next_roots_34
- ldr q_root0, [r01234_ptr], #16
-.endm
-
-.macro load_next_roots_56
- ldr q_root0, [r56_ptr], #(6*16)
- ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)]
- ldr q_root1, [r56_ptr, #(-6*16 + 2*16)]
- ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)]
- ldr q_root2, [r56_ptr, #(-6*16 + 4*16)]
- ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)]
-.endm
-
-.macro transpose4 data
- trn1 t0.4s, \data\()0.4s, \data\()1.4s
- trn2 t1.4s, \data\()0.4s, \data\()1.4s
- trn1 t2.4s, \data\()2.4s, \data\()3.4s
- trn2 t3.4s, \data\()2.4s, \data\()3.4s
-
- trn2 \data\()2.2d, t0.2d, t2.2d
- trn2 \data\()3.2d, t1.2d, t3.2d
- trn1 \data\()0.2d, t0.2d, t2.2d
- trn1 \data\()1.2d, t1.2d, t3.2d
-.endm
-
-.macro transpose_single data_out, data_in
- trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
- trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
- trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
- trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
-// For comparability reasons, the output range for the coefficients of this
-// invNTT code is supposed to match the implementation from PQClean on commit
-// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients
-// are NOT canonically reduced. The ordering of the coefficients is canonical,
-// also matching PQClean.
-
-.text
- .global MLKEM_ASM_NAMESPACE(intt_asm_clean)
-
- in .req x0
- r01234_ptr .req x1
- r56_ptr .req x2
-
- inp .req x3
- count .req x4
- wtmp .req w5
-
- data0 .req v8
- data1 .req v9
- data2 .req v10
- data3 .req v11
- data4 .req v12
- data5 .req v13
- data6 .req v14
- data7 .req v15
-
- q_data0 .req q8
- q_data1 .req q9
- q_data2 .req q10
- q_data3 .req q11
- q_data4 .req q12
- q_data5 .req q13
- q_data6 .req q14
- q_data7 .req q15
-
- root0 .req v0
- root1 .req v1
- root2 .req v2
- root0_tw .req v4
- root1_tw .req v5
- root2_tw .req v6
-
- consts .req v7
- q_consts .req q7
-
- q_root0 .req q0
- q_root1 .req q1
- q_root2 .req q2
- q_root0_tw .req q4
- q_root1_tw .req q5
- q_root2_tw .req q6
-
- tmp .req v24
- t0 .req v25
- t1 .req v26
- t2 .req v27
- t3 .req v28
-
- ninv .req v29
- ninv_tw .req v30
-
-.balign 4
-MLKEM_ASM_NAMESPACE(intt_asm_clean):
- push_stack
-
- // Setup constants
- mov wtmp, #3329
- mov consts.h[0], wtmp
- mov wtmp, #20159
- mov consts.h[1], wtmp
- mov wtmp, #512
- dup ninv.8h, wtmp
- mov wtmp, #5040
- dup ninv_tw.8h, wtmp
-
- mov inp, in
- mov count, #8
-
-scale_start:
-
- ldr q_data0, [inp, #(16*0)]
- ldr q_data1, [inp, #(16*1)]
- ldr q_data2, [inp, #(16*2)]
- ldr q_data3, [inp, #(16*3)]
-
- mul_ninv data0, data1, data2, data3, data0, data1, data2, data3
- // Bounds: Absolute value < q
-
- str q_data0, [inp], #64
- str q_data1, [inp, #(-64 + 16*1)]
- str q_data2, [inp, #(-64 + 16*2)]
- str q_data3, [inp, #(-64 + 16*3)]
-
- subs count, count, #1
- cbnz count, scale_start
-
- mov inp, in
- mov count, #8
-
- .p2align 2
-layer3456_start:
-
- ldr q_data0, [inp, #(16*0)]
- ldr q_data1, [inp, #(16*1)]
- ldr q_data2, [inp, #(16*2)]
- ldr q_data3, [inp, #(16*3)]
-
- transpose4 data // manual ld4
-
- load_next_roots_56
-
- // Layer 7
- gs_butterfly_v data0, data1, root1, root1_tw
- gs_butterfly_v data2, data3, root2, root2_tw
- // Bounds:
- // data0, data2: < 2q
- // data1, data3: < q
-
- // Layer 6
- gs_butterfly_v data0, data2, root0, root0_tw
- gs_butterfly_v data1, data3, root0, root0_tw
- // Bounds:
- // data0: < 4q
- // data1: < 2q
- // data2, data3: < q
-
- transpose4 data
-
- load_next_roots_34
-
- // Layer 5
- gs_butterfly data0, data1, root0, 2, 3
- gs_butterfly data2, data3, root0, 4, 5
- // Max bound: 8q
-
- // Not all of those reductions are needed, but the bounds tracking
- // is easier if we uniformly reduce at this point.
- barrett_reduce data0
- barrett_reduce data2
- barrett_reduce data1
- barrett_reduce data3
-
- // Bounds: q/2
-
- // Layer 4
- gs_butterfly data0, data2, root0, 0, 1
- gs_butterfly data1, data3, root0, 0, 1
- // Bounds: < q
-
- str q_data0, [inp], #(64)
- str q_data1, [inp, #(-64 + 16*1)]
- str q_data2, [inp, #(-64 + 16*2)]
- str q_data3, [inp, #(-64 + 16*3)]
-
- subs count, count, #1
- cbnz count, layer3456_start
-
- // ---------------------------------------------------------------------
-
- mov count, #4
- load_roots_012
-
- .p2align 2
-
-layer012_start:
-
- ldr q_data0, [in, #0]
- ldr q_data1, [in, #(1*(512/8))]
- ldr q_data2, [in, #(2*(512/8))]
- ldr q_data3, [in, #(3*(512/8))]
- ldr q_data4, [in, #(4*(512/8))]
- ldr q_data5, [in, #(5*(512/8))]
- ldr q_data6, [in, #(6*(512/8))]
- ldr q_data7, [in, #(7*(512/8))]
-
- gs_butterfly data0, data1, root0, 6, 7
- gs_butterfly data2, data3, root1, 0, 1
- gs_butterfly data4, data5, root1, 2, 3
- gs_butterfly data6, data7, root1, 4, 5
-
- gs_butterfly data0, data2, root0, 2, 3
- gs_butterfly data1, data3, root0, 2, 3
- gs_butterfly data4, data6, root0, 4, 5
- gs_butterfly data5, data7, root0, 4, 5
-
- gs_butterfly data0, data4, root0, 0, 1
- gs_butterfly data1, data5, root0, 0, 1
- gs_butterfly data2, data6, root0, 0, 1
- gs_butterfly data3, data7, root0, 0, 1
-
- // Bounds: < 8q
-
- str q_data4, [in, #(4*(512/8))]
- str q_data5, [in, #(5*(512/8))]
- str q_data6, [in, #(6*(512/8))]
- str q_data7, [in, #(7*(512/8))]
-
- str q_data0, [in], #(16)
- str q_data1, [in, #(-16 + 1*(512/8))]
- str q_data2, [in, #(-16 + 2*(512/8))]
- str q_data3, [in, #(-16 + 3*(512/8))]
-
- subs count, count, #1
- cbnz count, layer012_start
-
- pop_stack
- ret
-
-/****************** REGISTER DEALLOCATIONS *******************/
- .unreq in
- .unreq r01234_ptr
- .unreq r56_ptr
- .unreq inp
- .unreq count
- .unreq wtmp
- .unreq data0
- .unreq data1
- .unreq data2
- .unreq data3
- .unreq data4
- .unreq data5
- .unreq data6
- .unreq data7
- .unreq q_data0
- .unreq q_data1
- .unreq q_data2
- .unreq q_data3
- .unreq q_data4
- .unreq q_data5
- .unreq q_data6
- .unreq q_data7
- .unreq root0
- .unreq root1
- .unreq root2
- .unreq root0_tw
- .unreq root1_tw
- .unreq root2_tw
- .unreq consts
- .unreq q_consts
- .unreq q_root0
- .unreq q_root1
- .unreq q_root2
- .unreq q_root0_tw
- .unreq q_root1_tw
- .unreq q_root2_tw
- .unreq tmp
- .unreq t0
- .unreq t1
- .unreq t2
- .unreq t3
- .unreq ninv
- .unreq ninv_tw
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S
index 191de3c4d..0f9e44307 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S
@@ -25,6 +25,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
// Bounds:
// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
@@ -139,9 +140,6 @@
// are NOT canonically reduced. The ordering of the coefficients is canonical,
// also matching PQClean.
-.text
- .global MLKEM_ASM_NAMESPACE(intt_asm_opt)
-
in .req x0
r01234_ptr .req x1
r56_ptr .req x2
@@ -194,7 +192,9 @@
ninv .req v29
ninv_tw .req v30
-.balign 4
+ .text
+ .global MLKEM_ASM_NAMESPACE(intt_asm_opt)
+ .balign 4
MLKEM_ASM_NAMESPACE(intt_asm_opt):
push_stack
@@ -1042,4 +1042,5 @@ layer012_start:
.unreq ninv
.unreq ninv_tw
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S
deleted file mode 100644
index 4f844e212..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S
+++ /dev/null
@@ -1,317 +0,0 @@
-///
-/// Copyright (c) 2022 Arm Limited
-/// Copyright (c) 2022 Hanno Becker
-/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
-/// Copyright (c) 2024 The mlkem-native project authors
-// SPDX-License-Identifier: MIT
-///
-/// Permission is hereby granted, free of charge, to any person obtaining a copy
-/// of this software and associated documentation files (the "Software"), to deal
-/// in the Software without restriction, including without limitation the rights
-/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-/// copies of the Software, and to permit persons to whom the Software is
-/// furnished to do so, subject to the following conditions:
-///
-/// The above copyright notice and this permission notice shall be included in all
-/// copies or substantial portions of the Software.
-///
-/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-/// SOFTWARE.
-///
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
-
-// Bounds:
-// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
-//
-// See mlken/reduce.c and test/test_bounds.py for more details.
-.macro mulmodq dst, src, const, idx0, idx1
- // Signed barrett multiplication using
- // round-to-nearest-even-integer approximation.
- // Following https://eprint.iacr.org/2021/986.pdf, this
- // is functionally the same as a signed Montgomery multiplication
- // with a suitable constant of absolute value < q.
- sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()]
- mul \dst\().8h, \src\().8h, \const\().h[\idx0\()]
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro mulmod dst, src, const, const_twisted
- sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h
- mul \dst\().8h, \src\().8h, \const\().8h
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro ct_butterfly a, b, root, idx0, idx1
- mulmodq tmp, \b, \root, \idx0, \idx1
- sub \b\().8h, \a\().8h, tmp.8h
- add \a\().8h, \a\().8h, tmp.8h
-.endm
-
-.macro ct_butterfly_v a, b, root, root_twisted
- mulmod tmp, \b, \root, \root_twisted
- sub \b\().8h, \a\().8h, tmp.8h
- add \a\().8h, \a\().8h, tmp.8h
-.endm
-
-.macro load_roots_012
- ldr q_root0, [r01234_ptr], #32
- ldr q_root1, [r01234_ptr, #-16]
-.endm
-
-.macro load_next_roots_34
- ldr q_root0, [r01234_ptr], #16
-.endm
-
-.macro load_next_roots_56
- ldr q_root0, [r56_ptr], #(6*16)
- ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)]
- ldr q_root1, [r56_ptr, #(-6*16 + 2*16)]
- ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)]
- ldr q_root2, [r56_ptr, #(-6*16 + 4*16)]
- ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)]
-.endm
-
-.macro transpose4 data
- trn1 t0.4s, \data\()0.4s, \data\()1.4s
- trn2 t1.4s, \data\()0.4s, \data\()1.4s
- trn1 t2.4s, \data\()2.4s, \data\()3.4s
- trn2 t3.4s, \data\()2.4s, \data\()3.4s
-
- trn2 \data\()2.2d, t0.2d, t2.2d
- trn2 \data\()3.2d, t1.2d, t3.2d
- trn1 \data\()0.2d, t0.2d, t2.2d
- trn1 \data\()1.2d, t1.2d, t3.2d
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
- // Arguments
- in .req x0 // Input/output buffer
- r01234_ptr .req x1 // twiddles for layer 0,1,2,3,4
- r56_ptr .req x2 // twiddles for layer 5,6
-
- inp .req x3
- count .req x4
- wtmp .req w5
-
- data0 .req v8
- data1 .req v9
- data2 .req v10
- data3 .req v11
- data4 .req v12
- data5 .req v13
- data6 .req v14
- data7 .req v15
-
- q_data0 .req q8
- q_data1 .req q9
- q_data2 .req q10
- q_data3 .req q11
- q_data4 .req q12
- q_data5 .req q13
- q_data6 .req q14
- q_data7 .req q15
-
- root0 .req v0
- root1 .req v1
- root2 .req v2
- root0_tw .req v4
- root1_tw .req v5
- root2_tw .req v6
-
- q_root0 .req q0
- q_root1 .req q1
- q_root2 .req q2
- q_root0_tw .req q4
- q_root1_tw .req q5
- q_root2_tw .req q6
-
- consts .req v7
-
- tmp .req v24
- t0 .req v25
- t1 .req v26
- t2 .req v27
- t3 .req v28
-
- .text
- .global MLKEM_ASM_NAMESPACE(ntt_asm_clean)
-
- .balign 4
-MLKEM_ASM_NAMESPACE(ntt_asm_clean):
- push_stack
-
- mov wtmp, #3329
- mov consts.h[0], wtmp
- mov wtmp, #20159
- mov consts.h[1], wtmp
-
- mov inp, in
- mov count, #4
-
- load_roots_012
-
- .p2align 2
-
- // Bounds reasoning:
- // - There are 7 layers
- // - When passing from layer N to layer N+1, each layer-N value
- // is modified through the addition/subtraction of a Montgomery
- // product of a twiddle of absolute value < q/2 and a layer-N value.
- // - Recalling that for C such that |a| < C * q and |t|> 0);
- xtn out0.8b, data0.8h
-
- // r[3 * i + 1] = (t0 >> 8);
- shrn out1.8b, data0.8h, #8
- xtn tmp.8b, data1.8h
- // r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
- sli out1.8b, tmp.8b, #4
-
- // r[3 * i + 2] = (t1 >> 4);
- shrn out2.8b, data1.8h, #4
-
- st3 {out0.8b, out1.8b, out2.8b}, [dst], #24
-
- subs count, count, #1
- cbnz count, poly_tobytes_asm_clean_asm_loop_start
- ret
-
- .unreq data0
- .unreq data1
- .unreq out0
- .unreq out1
- .unreq out2
- .unreq tmp
- .unreq dst
- .unreq src
- .unreq count
-
-/**********************************
- * poly_tomont() *
- **********************************/
-.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean)
-
- src .req x0
- count .req x1
- wtmp .req w2
-
- data .req v0
- q_data .req q0
- res .req v1
- q_res .req q1
-
- factor .req v2
- factor_t .req v3
- modulus .req v4
- modulus_twisted .req v5
-
- tmp0 .req v6
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean):
-
- mov wtmp, #3329 // ML-KEM modulus
- dup modulus.8h, wtmp
-
- mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
- dup modulus_twisted.8h, wtmp
-
- mov wtmp, #-1044 // 2^16 % 3329
- dup factor.8h, wtmp
-
- mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
- dup factor_t.8h, wtmp
-
- mov count, #8
-poly_tomont_asm_loop:
-
- ldr q_data, [src], #64
- mulmod res, data, factor, factor_t
- str q_res, [src, #-64]
-
- ldr q_data, [src, #-48]
- mulmod res, data, factor, factor_t
- str q_res, [src, #-48]
-
- ldr q_data, [src, #-32]
- mulmod res, data, factor, factor_t
- str q_res, [src, #-32]
-
- ldr q_data, [src, #-16]
- mulmod res, data, factor, factor_t
- str q_res, [src, #-16]
-
- sub count, count, #1
- cbnz count, poly_tomont_asm_loop
-
- ret
-
- .unreq src
- .unreq count
- .unreq wtmp
-
- .unreq data
- .unreq q_data
- .unreq res
- .unreq q_res
-
- .unreq factor
- .unreq factor_t
- .unreq modulus
- .unreq modulus_twisted
-
- .unreq tmp0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S
new file mode 100644
index 000000000..a3593b7fd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/* Montgomery multiplication, with precomputed Montgomery twist
+ * Expects modulus in modulus.h[0]; uses tmp0 as scratch. */
+.macro mulmod dst, src, const, const_twisted
+ sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h // tmp0 = rounded high half of the doubled product src * const_twisted
+ mul \dst\().8h, \src\().8h, \const\().8h // dst = low 16 bits of src * const
+ mls \dst\().8h, tmp0.8h, modulus.h[0] // dst -= tmp0 * modulus (only lane 0 of modulus is read)
+.endm
+
+/********************************************
+ * poly_mulcache_compute() *
+ ********************************************/
+
+
+ cache_ptr .req x0 // output pointer; the routine post-increments x0 by 16 on every store
+ data_ptr .req x1 // input pointer; x1 advances by 32 bytes per block
+ zeta_ptr .req x2 // table of multipliers; x2 advances by 16 bytes per block
+ zeta_twisted_ptr .req x3 // table of twisted multipliers; x3 advances by 16 bytes per block
+ count .req x4 // loop counter
+ wtmp .req w5 // scratch used to materialize constants before dup
+
+ data_odd .req v0 // v0-v5 names follow the reference listing kept in comments; the scheduled code addresses vector registers by number
+ zeta .req v1
+ q_zeta .req q1
+ zeta_twisted .req v2
+ q_zeta_twisted .req q2
+
+ tmp0 .req v3 // scratch named by the mulmod macro
+ q_tmp0 .req q3
+ tmp1 .req v4
+ q_tmp1 .req q4
+ dst .req v5
+ q_dst .req q5
+
+ modulus .req v6 // lane 0 is the multiplier of every mls in the routine
+ modulus_twisted .req v7 // NOTE(review): v7 is reloaded from x3 inside the loop; confirm this alias is still needed
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): // per block: odd-indexed halfwords of 32 bytes at x1, times 16 bytes of multipliers from x2 (twisted: x3), reduced with 3329, stored as 16 bytes at x0
+ mov wtmp, #3329 // the ML-KEM modulus
+ dup modulus.8h, wtmp // broadcast to all 8 halfword lanes of v6
+
+ mov wtmp, #20159
+ dup modulus_twisted.8h, wtmp // NOTE(review): v7 is overwritten by ldr q7 in the loop and never read before that; this setup looks unused here
+
+ mov count, #16 // 16 blocks in total: 512 bytes read from x1, 256 bytes written to x0
+ // Instructions: 7
+ // Expected cycles: 12
+ // Expected IPC: 0.58
+
+ // Cycle bound: 12.0
+ // IPC bound: 0.58
+
+ // Wall time: 0.01s
+ // User time: 0.01s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q1, [x1, #16] // *.............................
+ ldr q27, [x1], #32 // ..*...........................
+ ldr q23, [x2], #16 // ....*.........................
+ uzp2 v27.8H, v27.8H, v1.8H // ......*.......................
+ ldr q1, [x3], #16 // .......*......................
+ mul v2.8H, v27.8H, v23.8H // .........*....................
+ sqrdmulh v27.8H, v27.8H, v1.8H // ...........*..................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q29, [x1, #16] // *..............................
+ // ldr q21, [x2], #16 // ....*..........................
+ // ldr q27, [x1], #32 // ..*............................
+ // ldr q7, [x3], #16 // .......*.......................
+ // uzp2 v28.8H, v27.8H, v29.8H // ......*........................
+ // mul v2.8H, v28.8H, v21.8H // .........*.....................
+ // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*...................
+
+ sub count, count, #1 // the first block was started above and the last is finished after the loop, so the loop body runs 15 times
+poly_mulcache_compute_asm_opt_loop:
+ // Instructions: 9
+ // Expected cycles: 13
+ // Expected IPC: 0.69
+
+ // Cycle bound: 13.0
+ // IPC bound: 0.69
+
+ // Wall time: 0.09s
+ // User time: 0.09s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q29, [x1, #16] // e.............................
+ ldr q21, [x2], #16 // ..e...........................
+ mls v2.8H, v27.8H, v6.H[0] // ....*.........................
+ ldr q27, [x1], #32 // .....e........................
+ ldr q7, [x3], #16 // .......e......................
+ uzp2 v28.8H, v27.8H, v29.8H // .........e....................
+ str q2, [x0], #16 // ..........*...................
+ mul v2.8H, v28.8H, v21.8H // ...........e..................
+ sqrdmulh v27.8H, v28.8H, v7.8H // ............e.................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q3, [x1], #32 // .....e.......'....~.......'....
+ // ldr q4, [x1, #-16] // e............~............~....
+ // ldr q1, [x2], #16 // ..e..........'.~..........'.~..
+ // ldr q2, [x3], #16 // .......e.....'......~.....'....
+ // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'....
+ // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'....
+ // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'....
+ // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'....
+ // str q5, [x0], #16 // ..........~..'.........*..'....
+
+ sub count, count, 1
+ cbnz count, poly_mulcache_compute_asm_opt_loop // on exit v2 and v27 still hold the unfinished last block
+ // Instructions: 2
+ // Expected cycles: 5
+ // Expected IPC: 0.40
+
+ // Cycle bound: 5.0
+ // IPC bound: 0.40
+
+ // Wall time: 0.00s
+ // User time: 0.00s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ mls v2.8H, v27.8H, v6.H[0] // *.............................
+ str q2, [x0], #16 // ....*.........................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // mls v2.8H, v27.8H, v6.H[0] // *..............................
+ // str q2, [x0], #16 // ....*..........................
+
+
+ ret
+
+ .unreq cache_ptr // release every .req alias declared for this routine, in declaration order
+ .unreq data_ptr
+ .unreq zeta_ptr
+ .unreq zeta_twisted_ptr
+ .unreq count
+ .unreq wtmp
+
+ .unreq data_odd
+ .unreq zeta
+ .unreq q_zeta
+ .unreq zeta_twisted
+ .unreq q_zeta_twisted
+
+ .unreq tmp0
+ .unreq q_tmp0
+ .unreq tmp1
+ .unreq q_tmp1
+ .unreq dst
+ .unreq q_dst
+
+ .unreq modulus
+ .unreq modulus_twisted
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S
deleted file mode 100644
index 79605818f..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S
+++ /dev/null
@@ -1,670 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
-
-/*
- * Some modular arithmetic macros
- */
-
-/* Barrett reduction */
-.macro barrett_reduce a
- sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0]
- srshr tmp.8h, tmp.8h, #11
- mls \a\().8h, tmp.8h, modulus.h[0]
-.endm
-
-/* Montgomery multiplication, with precomputed Montgomery twist
- * Expects modulus in consts.h[0]. */
-.macro mulmod dst, src, const, const_twisted
- sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h
- mul \dst\().8h, \src\().8h, \const\().8h
- mls \dst\().8h, tmp0.8h, modulus.h[0]
-.endm
-
-/* Turns signed-canonical to unsigned canonical representative
- * through conditional addition of the modulus.
- *
- * Expected modulus in `modulus`. */
-.macro scalar_signed_to_unsigned a
- sshr mask.8h, \a\().8h, #15
- and mask.16b, modulus.16b, mask.16b
- add \a\().8h, \a\().8h, mask.8h
-.endm
-
-/**********************************
- * poly_reduce() *
- **********************************/
-
-.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt)
-
- ptr .req x0
- count .req x1
- wtmp .req w2
-
- data .req v0
- q_data .req q0
-
- tmp .req v1
- mask .req v2
- modulus .req v3
- modulus_twisted .req v4
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
-
- mov wtmp, #3329 // ML-KEM modulus
- dup modulus.8h, wtmp
-
- mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
- dup modulus_twisted.8h, wtmp
-
- mov count, #8
- // Instructions: 15
- // Expected cycles: 22
- // Expected IPC: 0.68
-
- // Cycle bound: 22.0
- // IPC bound: 0.68
-
- // Wall time: 0.05s
- // User time: 0.05s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q21, [x0, #32] // *.............................
- ldr q23, [x0, #48] // ..*...........................
- sqdmulh v7.8H, v21.8H, v4.H[0] // ....*.........................
- sqdmulh v30.8H, v23.8H, v4.H[0] // ......*.......................
- srshr v7.8H, v7.8H, #11 // ........*.....................
- srshr v30.8H, v30.8H, #11 // ..........*...................
- mls v21.8H, v7.8H, v3.H[0] // ...........*..................
- mls v23.8H, v30.8H, v3.H[0] // .............*................
- ldr q5, [x0, #16] // ..............*...............
- sshr v7.8H, v21.8H, #15 // ................*.............
- sshr v30.8H, v23.8H, #15 // .................*............
- and v7.16B, v3.16B, v7.16B // ..................*...........
- add v21.8H, v21.8H, v7.8H // ...................*..........
- and v7.16B, v3.16B, v30.16B // ....................*.........
- add v16.8H, v23.8H, v7.8H // .....................*........
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q30, [x0, #32] // *..............................
- // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*..........................
- // ldr q2, [x0, #48] // ..*............................
- // srshr v19.8H, v22.8H, #11 // ........*......................
- // mls v30.8H, v19.8H, v3.H[0] // ...........*...................
- // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................
- // sshr v31.8H, v30.8H, #15 // ................*..............
- // srshr v25.8H, v25.8H, #11 // ..........*....................
- // and v18.16B, v3.16B, v31.16B // ..................*............
- // mls v2.8H, v25.8H, v3.H[0] // .............*.................
- // add v21.8H, v30.8H, v18.8H // ...................*...........
- // ldr q5, [x0, #16] // ..............*................
- // sshr v18.8H, v2.8H, #15 // .................*.............
- // and v27.16B, v3.16B, v18.16B // ....................*..........
- // add v16.8H, v2.8H, v27.8H // .....................*.........
-
- sub count, count, #1
-1:
- // Instructions: 32
- // Expected cycles: 36
- // Expected IPC: 0.89
-
- // Cycle bound: 36.0
- // IPC bound: 0.89
-
- // Wall time: 1.05s
- // User time: 1.05s
-
- // -------- cycle (expected) --------->
- // 0 25
- // |------------------------|----------
- ldr q6, [x0], #64 // *...................................
- ldr q30, [x0, #32] // ..e.................................
- sqdmulh v31.8H, v6.8H, v4.H[0] // ....*...............................
- sqdmulh v29.8H, v5.8H, v4.H[0] // .....*..............................
- sqdmulh v22.8H, v30.8H, v4.H[0] // ......e.............................
- str q16, [x0, #-16] // .......*............................
- srshr v20.8H, v31.8H, #11 // ........*...........................
- srshr v28.8H, v29.8H, #11 // .........*..........................
- str q21, [x0, #-32] // ..........*.........................
- mls v6.8H, v20.8H, v3.H[0] // ...........*........................
- mls v5.8H, v28.8H, v3.H[0] // ............*.......................
- ldr q2, [x0, #48] // .............e......................
- sshr v31.8H, v6.8H, #15 // ...............*....................
- srshr v19.8H, v22.8H, #11 // ................e...................
- and v22.16B, v3.16B, v31.16B // .................*..................
- add v0.8H, v6.8H, v22.8H // ..................*.................
- mls v30.8H, v19.8H, v3.H[0] // ...................e................
- sshr v26.8H, v5.8H, #15 // ....................*...............
- sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e..............
- and v17.16B, v3.16B, v26.16B // ......................*.............
- add v1.8H, v5.8H, v17.8H // .......................*............
- sshr v31.8H, v30.8H, #15 // ........................e...........
- srshr v25.8H, v25.8H, #11 // .........................e..........
- str q1, [x0, #-48] // ..........................*.........
- and v18.16B, v3.16B, v31.16B // ...........................e........
- mls v2.8H, v25.8H, v3.H[0] // ............................e.......
- add v21.8H, v30.8H, v18.8H // .............................e......
- ldr q5, [x0, #16] // ..............................e.....
- sshr v18.8H, v2.8H, #15 // ................................e...
- str q0, [x0, #-64] // .................................*..
- and v27.16B, v3.16B, v18.16B // ..................................e.
- add v16.8H, v2.8H, v27.8H // ...................................e
-
- // ------------------------ cycle (expected) ------------------------->
- // 0 25 50
- // |------------------------|------------------------|-----------------
- // ldr q0, [x0], #64 // ..................................*.................................
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*.............................
- // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*.........................
- // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*......................
- // sshr v2.8h, v0.8h, #15 // .............~....................'..............*..................
- // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................
- // add v0.8h, v0.8h, v2.8h // ................~.................'.................*...............
- // str q0, [x0, #-64] // ...............................~..'................................*
- // ldr q0, [x0, #-48] // ............................e.....'.............................~...
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................
- // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................
- // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*.....................
- // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*.............
- // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*...........
- // add v0.8h, v0.8h, v2.8h // .....................~............'......................*..........
- // str q0, [x0, #-48] // ........................~.........'.........................*.......
- // ldr q0, [x0, #-32] // e.................................'.~...............................
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~...........................
- // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~.................
- // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~..............
- // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~.........
- // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~......
- // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~....
- // str q0, [x0, #-32] // ........~.........................'.........*.......................
- // ldr q0, [x0, #-16] // ...........e......................'............~....................
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............
- // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........
- // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~.....
- // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~.
- // and v2.16b, v3.16b, v2.16b // ................................e.'.................................
- // add v0.8h, v0.8h, v2.8h // .................................e'.................................
- // str q0, [x0, #-16] // .....~............................'......*..........................
-
- sub count, count, 1
- cbnz count, 1b
- // Instructions: 17
- // Expected cycles: 23
- // Expected IPC: 0.74
-
- // Cycle bound: 23.0
- // IPC bound: 0.74
-
- // Wall time: 0.05s
- // User time: 0.05s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- sqdmulh v20.8H, v5.8H, v4.H[0] // *.............................
- ldr q24, [x0], #64 // .*............................
- str q21, [x0, #-32] // ...*..........................
- srshr v20.8H, v20.8H, #11 // ....*.........................
- sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................
- str q16, [x0, #-16] // ......*.......................
- mls v5.8H, v20.8H, v3.H[0] // .......*......................
- srshr v20.8H, v25.8H, #11 // .........*....................
- sshr v2.8H, v5.8H, #15 // ...........*..................
- mls v24.8H, v20.8H, v3.H[0] // ............*.................
- and v20.16B, v3.16B, v2.16B // .............*................
- add v31.8H, v5.8H, v20.8H // ..............*...............
- sshr v20.8H, v24.8H, #15 // ................*.............
- str q31, [x0, #-48] // .................*............
- and v31.16B, v3.16B, v20.16B // ..................*...........
- add v24.8H, v24.8H, v31.8H // ...................*..........
- str q24, [x0, #-64] // ......................*.......
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q6, [x0], #64 // .*.............................
- // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*.........................
- // sqdmulh v29.8H, v5.8H, v4.H[0] // *..............................
- // str q16, [x0, #-16] // ......*........................
- // srshr v20.8H, v31.8H, #11 // .........*.....................
- // srshr v28.8H, v29.8H, #11 // ....*..........................
- // str q21, [x0, #-32] // ...*...........................
- // mls v6.8H, v20.8H, v3.H[0] // ............*..................
- // mls v5.8H, v28.8H, v3.H[0] // .......*.......................
- // sshr v31.8H, v6.8H, #15 // ................*..............
- // and v22.16B, v3.16B, v31.16B // ..................*............
- // add v0.8H, v6.8H, v22.8H // ...................*...........
- // sshr v26.8H, v5.8H, #15 // ...........*...................
- // and v17.16B, v3.16B, v26.16B // .............*.................
- // add v1.8H, v5.8H, v17.8H // ..............*................
- // str q1, [x0, #-48] // .................*.............
- // str q0, [x0, #-64] // ......................*........
-
-
- ret
-
- .unreq ptr
- .unreq count
- .unreq wtmp
-
- .unreq data
- .unreq q_data
-
- .unreq tmp
- .unreq mask
- .unreq modulus
- .unreq modulus_twisted
-
-/********************************************
- * poly_mulcache_compute() *
- ********************************************/
-
-.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt)
-
- cache_ptr .req x0
- data_ptr .req x1
- zeta_ptr .req x2
- zeta_twisted_ptr .req x3
- count .req x4
- wtmp .req w5
-
- data_odd .req v0
- zeta .req v1
- q_zeta .req q1
- zeta_twisted .req v2
- q_zeta_twisted .req q2
-
- tmp0 .req v3
- q_tmp0 .req q3
- tmp1 .req v4
- q_tmp1 .req q4
- dst .req v5
- q_dst .req q5
-
- modulus .req v6
- modulus_twisted .req v7
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #20159
- dup modulus_twisted.8h, wtmp
-
- mov count, #16
- // Instructions: 7
- // Expected cycles: 12
- // Expected IPC: 0.58
-
- // Cycle bound: 12.0
- // IPC bound: 0.58
-
- // Wall time: 0.01s
- // User time: 0.01s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q1, [x1, #16] // *.............................
- ldr q27, [x1], #32 // ..*...........................
- ldr q23, [x2], #16 // ....*.........................
- uzp2 v27.8H, v27.8H, v1.8H // ......*.......................
- ldr q1, [x3], #16 // .......*......................
- mul v2.8H, v27.8H, v23.8H // .........*....................
- sqrdmulh v27.8H, v27.8H, v1.8H // ...........*..................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q29, [x1, #16] // *..............................
- // ldr q21, [x2], #16 // ....*..........................
- // ldr q27, [x1], #32 // ..*............................
- // ldr q7, [x3], #16 // .......*.......................
- // uzp2 v28.8H, v27.8H, v29.8H // ......*........................
- // mul v2.8H, v28.8H, v21.8H // .........*.....................
- // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*...................
-
- sub count, count, #1
-1:
- // Instructions: 9
- // Expected cycles: 13
- // Expected IPC: 0.69
-
- // Cycle bound: 13.0
- // IPC bound: 0.69
-
- // Wall time: 0.09s
- // User time: 0.09s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q29, [x1, #16] // e.............................
- ldr q21, [x2], #16 // ..e...........................
- mls v2.8H, v27.8H, v6.H[0] // ....*.........................
- ldr q27, [x1], #32 // .....e........................
- ldr q7, [x3], #16 // .......e......................
- uzp2 v28.8H, v27.8H, v29.8H // .........e....................
- str q2, [x0], #16 // ..........*...................
- mul v2.8H, v28.8H, v21.8H // ...........e..................
- sqrdmulh v27.8H, v28.8H, v7.8H // ............e.................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q3, [x1], #32 // .....e.......'....~.......'....
- // ldr q4, [x1, #-16] // e............~............~....
- // ldr q1, [x2], #16 // ..e..........'.~..........'.~..
- // ldr q2, [x3], #16 // .......e.....'......~.....'....
- // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'....
- // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'....
- // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'....
- // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'....
- // str q5, [x0], #16 // ..........~..'.........*..'....
-
- sub count, count, 1
- cbnz count, 1b
- // Instructions: 2
- // Expected cycles: 5
- // Expected IPC: 0.40
-
- // Cycle bound: 5.0
- // IPC bound: 0.40
-
- // Wall time: 0.00s
- // User time: 0.00s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- mls v2.8H, v27.8H, v6.H[0] // *.............................
- str q2, [x0], #16 // ....*.........................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // mls v2.8H, v27.8H, v6.H[0] // *..............................
- // str q2, [x0], #16 // ....*..........................
-
-
- ret
-
- .unreq cache_ptr
- .unreq data_ptr
- .unreq zeta_ptr
- .unreq zeta_twisted_ptr
- .unreq count
- .unreq wtmp
-
- .unreq data_odd
- .unreq zeta
- .unreq q_zeta
- .unreq zeta_twisted
- .unreq q_zeta_twisted
-
- .unreq tmp0
- .unreq q_tmp0
- .unreq tmp1
- .unreq q_tmp1
- .unreq dst
- .unreq q_dst
-
- .unreq modulus
- .unreq modulus_twisted
-
-/********************************************
- * poly_tobytes() *
- ********************************************/
-.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt)
-
- data0 .req v0
- data1 .req v1
- out0 .req v2
- out1 .req v3
- out2 .req v4
- tmp .req v5
-
- dst .req x0
- src .req x1
- count .req x2
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt):
-
- mov count, #16
-poly_tobytes_asm_opt_asm_loop_start:
- ld2 {data0.8h, data1.8h}, [src], #32
-
- // r[3 * i + 0] = (t0 >> 0);
- xtn out0.8b, data0.8h
-
- // r[3 * i + 1] = (t0 >> 8);
- shrn out1.8b, data0.8h, #8
- xtn tmp.8b, data1.8h
- // r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
- sli out1.8b, tmp.8b, #4
-
- // r[3 * i + 2] = (t1 >> 4);
- shrn out2.8b, data1.8h, #4
-
- st3 {out0.8b, out1.8b, out2.8b}, [dst], #24
-
- subs count, count, #1
- cbnz count, poly_tobytes_asm_opt_asm_loop_start
- ret
-
- .unreq data0
- .unreq data1
- .unreq out0
- .unreq out1
- .unreq out2
- .unreq tmp
- .unreq dst
- .unreq src
- .unreq count
-
-/**********************************
- * poly_tomont() *
- **********************************/
-.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt)
-
- src .req x0
- count .req x1
- wtmp .req w2
-
- data .req v0
- q_data .req q0
- res .req v1
- q_res .req q1
-
- factor .req v2
- factor_t .req v3
- modulus .req v4
- modulus_twisted .req v5
-
- tmp0 .req v6
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
-
- mov wtmp, #3329 // ML-KEM modulus
- dup modulus.8h, wtmp
-
- mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
- dup modulus_twisted.8h, wtmp
-
- mov wtmp, #-1044 // 2^16 % 3329
- dup factor.8h, wtmp
-
- mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
- dup factor_t.8h, wtmp
-
- mov count, #8
- // Instructions: 5
- // Expected cycles: 7
- // Expected IPC: 0.71
- //
- // Cycle bound: 7.0
- // IPC bound: 0.71
- //
- // Wall time: 0.01s
- // User time: 0.01s
- //
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q26, [x0, #48] // *.............................
- ldr q23, [x0, #16] // ..*...........................
- mul v17.8H, v26.8H, v2.8H // ....*.........................
- sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................
- ldr q27, [x0, #32] // ......*.......................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q7, [x0, #48] // *..............................
- // ldr q23, [x0, #16] // ..*............................
- // mul v17.8H, v7.8H, v2.8H // ....*..........................
- // sqrdmulh v7.8H, v7.8H, v3.8H // .....*.........................
- // ldr q27, [x0, #32] // ......*........................
-
- sub count, count, #1
-1:
- // Instructions: 20
- // Expected cycles: 24
- // Expected IPC: 0.83
- //
- // Cycle bound: 24.0
- // IPC bound: 0.83
- //
- // Wall time: 0.73s
- // User time: 0.73s
- //
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- mls v17.8H, v7.8H, v4.H[0] // *.............................
- sqrdmulh v5.8H, v23.8H, v3.8H // .*............................
- ldr q7, [x0], #64 // ..*...........................
- str q17, [x0, #-16] // ....*.........................
- sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................
- sqrdmulh v19.8H, v7.8H, v3.8H // ......*.......................
- mul v25.8H, v23.8H, v2.8H // .......*......................
- mul v0.8H, v7.8H, v2.8H // ........*.....................
- mul v26.8H, v27.8H, v2.8H // .........*....................
- ldr q7, [x0, #48] // ..........e...................
- mls v25.8H, v5.8H, v4.H[0] // ............*.................
- ldr q23, [x0, #16] // .............e................
- mls v26.8H, v29.8H, v4.H[0] // ...............*..............
- mls v0.8H, v19.8H, v4.H[0] // ................*.............
- str q25, [x0, #-48] // .................*............
- mul v17.8H, v7.8H, v2.8H // ..................e...........
- sqrdmulh v7.8H, v7.8H, v3.8H // ...................e..........
- str q0, [x0, #-64] // ....................*.........
- ldr q27, [x0, #32] // .....................e........
- str q26, [x0, #-32] // .......................*......
-
- // --------- cycle (expected) ---------->
- // 0 25
- // |------------------------|------------
- // ldr q0, [x0], #64 // ..............'.*.....................
- // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*.................
- // mul v1.8h, v0.8h, v2.8h // ..............'.......*...............
- // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*.......
- // str q1, [x0, #-64] // ..........~...'...................*...
- // ldr q0, [x0, #-48] // ...e..........'............~..........
- // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*......................
- // mul v1.8h, v0.8h, v2.8h // ..............'......*................
- // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*...........
- // str q1, [x0, #-48] // .......~......'................*......
- // ldr q0, [x0, #-32] // ...........e..'....................~..
- // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*..................
- // mul v1.8h, v0.8h, v2.8h // ..............'........*..............
- // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........
- // str q1, [x0, #-32] // .............~'......................*
- // ldr q0, [x0, #-16] // e.............'.........~.............
- // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~....
- // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~.....
- // mls v1.8h, v6.8h, v4.h[0] // ..............*.......................
- // str q1, [x0, #-16] // ..............'...*...................
-
- sub count, count, 1
- cbnz count, 1b
- // Instructions: 15
- // Expected cycles: 18
- // Expected IPC: 0.83
- //
- // Cycle bound: 18.0
- // IPC bound: 0.83
- //
- // Wall time: 0.07s
- // User time: 0.07s
- //
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- mls v17.8H, v7.8H, v4.H[0] // *.............................
- sqrdmulh v7.8H, v23.8H, v3.8H // .*............................
- mul v26.8H, v23.8H, v2.8H // ..*...........................
- sqrdmulh v25.8H, v27.8H, v3.8H // ...*..........................
- ldr q23, [x0], #64 // ....*.........................
- mul v27.8H, v27.8H, v2.8H // ......*.......................
- mls v26.8H, v7.8H, v4.H[0] // .......*......................
- sqrdmulh v7.8H, v23.8H, v3.8H // ........*.....................
- mul v23.8H, v23.8H, v2.8H // .........*....................
- str q17, [x0, #-16] // ..........*...................
- mls v27.8H, v25.8H, v4.H[0] // ...........*..................
- str q26, [x0, #-48] // ............*.................
- mls v23.8H, v7.8H, v4.H[0] // .............*................
- str q27, [x0, #-32] // ...............*..............
- str q23, [x0, #-64] // .................*............
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // mls v17.8H, v7.8H, v4.H[0] // *..............................
- // sqrdmulh v5.8H, v23.8H, v3.8H // .*.............................
- // ldr q7, [x0], #64 // ....*..........................
- // str q17, [x0, #-16] // ..........*....................
- // sqrdmulh v29.8H, v27.8H, v3.8H // ...*...........................
- // sqrdmulh v19.8H, v7.8H, v3.8H // ........*......................
- // mul v25.8H, v23.8H, v2.8H // ..*............................
- // mul v0.8H, v7.8H, v2.8H // .........*.....................
- // mul v26.8H, v27.8H, v2.8H // ......*........................
- // mls v25.8H, v5.8H, v4.H[0] // .......*.......................
- // mls v26.8H, v29.8H, v4.H[0] // ...........*...................
- // mls v0.8H, v19.8H, v4.H[0] // .............*.................
- // str q25, [x0, #-48] // ............*..................
- // str q0, [x0, #-64] // .................*.............
- // str q26, [x0, #-32] // ...............*...............
-
-
- ret
-
- .unreq src
- .unreq count
- .unreq wtmp
-
- .unreq data
- .unreq q_data
- .unreq res
- .unreq q_res
-
- .unreq factor
- .unreq factor_t
- .unreq modulus
- .unreq modulus_twisted
-
- .unreq tmp0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_reduce_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_reduce_asm_opt.S
new file mode 100644
index 000000000..410950730
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_reduce_asm_opt.S
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/* Barrett reduction of the eight 16-bit lanes of `a`, in place. Expects the modulus in modulus.h[0] and its twist in modulus_twisted.h[0]; clobbers tmp. */
+.macro barrett_reduce a
+ sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] // tmp = high half of the doubled product a * twist
+ srshr tmp.8h, tmp.8h, #11 // rounding shift right by 11: quotient estimate
+ mls \a\().8h, tmp.8h, modulus.h[0] // a -= tmp * modulus
+.endm
+
+/* Maps each signed-canonical 16-bit lane of `a` to its unsigned canonical
+ * representative by adding the modulus to the negative lanes only.
+ *
+ * Expects the modulus in every lane of `modulus`; clobbers `mask`. */
+.macro scalar_signed_to_unsigned a
+ sshr mask.8h, \a\().8h, #15 // all-ones in negative lanes, zero elsewhere
+ and mask.16b, modulus.16b, mask.16b // modulus in negative lanes, zero elsewhere
+ add \a\().8h, \a\().8h, mask.8h // conditional addition of the modulus
+.endm
+
+/**********************************
+ * poly_reduce() *
+ **********************************/
+
+ ptr .req x0 // data pointer; the code below loads from and stores back to it
+ count .req x1 // remaining-pass counter
+ wtmp .req w2 // scratch for materializing constants
+
+ data .req v0
+ q_data .req q0
+
+ tmp .req v1
+ mask .req v2
+ modulus .req v3 // 3329 in every 16-bit lane
+ modulus_twisted .req v4 // 20159 in every 16-bit lane
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
+
+ mov wtmp, #3329 // ML-KEM modulus
+ dup modulus.8h, wtmp // broadcast to all eight 16-bit lanes
+
+ mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+ dup modulus_twisted.8h, wtmp // broadcast to all eight 16-bit lanes
+
+ mov count, #8 // x0 is post-incremented by 64 bytes eight times in total: seven loop passes plus the tail
+ // Instructions: 15
+ // Expected cycles: 22
+ // Expected IPC: 0.68
+
+ // Cycle bound: 22.0
+ // IPC bound: 0.68
+
+ // Wall time: 0.05s
+ // User time: 0.05s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q21, [x0, #32] // *.............................
+ ldr q23, [x0, #48] // ..*...........................
+ sqdmulh v7.8H, v21.8H, v4.H[0] // ....*.........................
+ sqdmulh v30.8H, v23.8H, v4.H[0] // ......*.......................
+ srshr v7.8H, v7.8H, #11 // ........*.....................
+ srshr v30.8H, v30.8H, #11 // ..........*...................
+ mls v21.8H, v7.8H, v3.H[0] // ...........*..................
+ mls v23.8H, v30.8H, v3.H[0] // .............*................
+ ldr q5, [x0, #16] // ..............*...............
+ sshr v7.8H, v21.8H, #15 // ................*.............
+ sshr v30.8H, v23.8H, #15 // .................*............
+ and v7.16B, v3.16B, v7.16B // ..................*...........
+ add v21.8H, v21.8H, v7.8H // ...................*..........
+ and v7.16B, v3.16B, v30.16B // ....................*.........
+ add v16.8H, v23.8H, v7.8H // .....................*........
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q30, [x0, #32] // *..............................
+ // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*..........................
+ // ldr q2, [x0, #48] // ..*............................
+ // srshr v19.8H, v22.8H, #11 // ........*......................
+ // mls v30.8H, v19.8H, v3.H[0] // ...........*...................
+ // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................
+ // sshr v31.8H, v30.8H, #15 // ................*..............
+ // srshr v25.8H, v25.8H, #11 // ..........*....................
+ // and v18.16B, v3.16B, v31.16B // ..................*............
+ // mls v2.8H, v25.8H, v3.H[0] // .............*.................
+ // add v21.8H, v30.8H, v18.8H // ...................*...........
+ // ldr q5, [x0, #16] // ..............*................
+ // sshr v18.8H, v2.8H, #15 // .................*.............
+ // and v27.16B, v3.16B, v18.16B // ....................*..........
+ // add v16.8H, v2.8H, v27.8H // .....................*.........
+
+ sub count, count, #1
+poly_reduce_asm_opt_loop:
+ // Instructions: 32
+ // Expected cycles: 36
+ // Expected IPC: 0.89
+
+ // Cycle bound: 36.0
+ // IPC bound: 0.89
+
+ // Wall time: 1.05s
+ // User time: 1.05s
+
+ // -------- cycle (expected) --------->
+ // 0 25
+ // |------------------------|----------
+ ldr q6, [x0], #64 // *...................................
+ ldr q30, [x0, #32] // ..e.................................
+ sqdmulh v31.8H, v6.8H, v4.H[0] // ....*...............................
+ sqdmulh v29.8H, v5.8H, v4.H[0] // .....*..............................
+ sqdmulh v22.8H, v30.8H, v4.H[0] // ......e.............................
+ str q16, [x0, #-16] // .......*............................
+ srshr v20.8H, v31.8H, #11 // ........*...........................
+ srshr v28.8H, v29.8H, #11 // .........*..........................
+ str q21, [x0, #-32] // ..........*.........................
+ mls v6.8H, v20.8H, v3.H[0] // ...........*........................
+ mls v5.8H, v28.8H, v3.H[0] // ............*.......................
+ ldr q2, [x0, #48] // .............e......................
+ sshr v31.8H, v6.8H, #15 // ...............*....................
+ srshr v19.8H, v22.8H, #11 // ................e...................
+ and v22.16B, v3.16B, v31.16B // .................*..................
+ add v0.8H, v6.8H, v22.8H // ..................*.................
+ mls v30.8H, v19.8H, v3.H[0] // ...................e................
+ sshr v26.8H, v5.8H, #15 // ....................*...............
+ sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e..............
+ and v17.16B, v3.16B, v26.16B // ......................*.............
+ add v1.8H, v5.8H, v17.8H // .......................*............
+ sshr v31.8H, v30.8H, #15 // ........................e...........
+ srshr v25.8H, v25.8H, #11 // .........................e..........
+ str q1, [x0, #-48] // ..........................*.........
+ and v18.16B, v3.16B, v31.16B // ...........................e........
+ mls v2.8H, v25.8H, v3.H[0] // ............................e.......
+ add v21.8H, v30.8H, v18.8H // .............................e......
+ ldr q5, [x0, #16] // ..............................e.....
+ sshr v18.8H, v2.8H, #15 // ................................e...
+ str q0, [x0, #-64] // .................................*..
+ and v27.16B, v3.16B, v18.16B // ..................................e.
+ add v16.8H, v2.8H, v27.8H // ...................................e
+
+ // ------------------------ cycle (expected) ------------------------->
+ // 0 25 50
+ // |------------------------|------------------------|-----------------
+ // ldr q0, [x0], #64 // ..................................*.................................
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*.............................
+ // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*.........................
+ // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*......................
+ // sshr v2.8h, v0.8h, #15 // .............~....................'..............*..................
+ // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................
+ // add v0.8h, v0.8h, v2.8h // ................~.................'.................*...............
+ // str q0, [x0, #-64] // ...............................~..'................................*
+ // ldr q0, [x0, #-48] // ............................e.....'.............................~...
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................
+ // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................
+ // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*.....................
+ // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*.............
+ // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*...........
+ // add v0.8h, v0.8h, v2.8h // .....................~............'......................*..........
+ // str q0, [x0, #-48] // ........................~.........'.........................*.......
+ // ldr q0, [x0, #-32] // e.................................'.~...............................
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~...........................
+ // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~.................
+ // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~..............
+ // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~.........
+ // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~......
+ // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~....
+ // str q0, [x0, #-32] // ........~.........................'.........*.......................
+ // ldr q0, [x0, #-16] // ...........e......................'............~....................
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............
+ // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........
+ // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~.....
+ // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~.
+ // and v2.16b, v3.16b, v2.16b // ................................e.'.................................
+ // add v0.8h, v0.8h, v2.8h // .................................e'.................................
+ // str q0, [x0, #-16] // .....~............................'......*..........................
+
+ sub count, count, 1
+ cbnz count, poly_reduce_asm_opt_loop
+ // Instructions: 17
+ // Expected cycles: 23
+ // Expected IPC: 0.74
+
+ // Cycle bound: 23.0
+ // IPC bound: 0.74
+
+ // Wall time: 0.05s
+ // User time: 0.05s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ sqdmulh v20.8H, v5.8H, v4.H[0] // *.............................
+ ldr q24, [x0], #64 // .*............................
+ str q21, [x0, #-32] // ...*..........................
+ srshr v20.8H, v20.8H, #11 // ....*.........................
+ sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................
+ str q16, [x0, #-16] // ......*.......................
+ mls v5.8H, v20.8H, v3.H[0] // .......*......................
+ srshr v20.8H, v25.8H, #11 // .........*....................
+ sshr v2.8H, v5.8H, #15 // ...........*..................
+ mls v24.8H, v20.8H, v3.H[0] // ............*.................
+ and v20.16B, v3.16B, v2.16B // .............*................
+ add v31.8H, v5.8H, v20.8H // ..............*...............
+ sshr v20.8H, v24.8H, #15 // ................*.............
+ str q31, [x0, #-48] // .................*............
+ and v31.16B, v3.16B, v20.16B // ..................*...........
+ add v24.8H, v24.8H, v31.8H // ...................*..........
+ str q24, [x0, #-64] // ......................*.......
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q6, [x0], #64 // .*.............................
+ // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*.........................
+ // sqdmulh v29.8H, v5.8H, v4.H[0] // *..............................
+ // str q16, [x0, #-16] // ......*........................
+ // srshr v20.8H, v31.8H, #11 // .........*.....................
+ // srshr v28.8H, v29.8H, #11 // ....*..........................
+ // str q21, [x0, #-32] // ...*...........................
+ // mls v6.8H, v20.8H, v3.H[0] // ............*..................
+ // mls v5.8H, v28.8H, v3.H[0] // .......*.......................
+ // sshr v31.8H, v6.8H, #15 // ................*..............
+ // and v22.16B, v3.16B, v31.16B // ..................*............
+ // add v0.8H, v6.8H, v22.8H // ...................*...........
+ // sshr v26.8H, v5.8H, #15 // ...........*...................
+ // and v17.16B, v3.16B, v26.16B // .............*.................
+ // add v1.8H, v5.8H, v17.8H // ..............*................
+ // str q1, [x0, #-48] // .................*.............
+ // str q0, [x0, #-64] // ......................*........
+
+
+ ret
+
+ .unreq ptr
+ .unreq count
+ .unreq wtmp
+
+ .unreq data
+ .unreq q_data
+
+ .unreq tmp
+ .unreq mask
+ .unreq modulus
+ .unreq modulus_twisted
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S
new file mode 100644
index 000000000..bc33afd43
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/********************************************
+ * poly_tobytes() *
+ ********************************************/
+
+ data0 .req v0 // even-indexed 16-bit inputs (t0) after ld2 de-interleaving
+ data1 .req v1 // odd-indexed 16-bit inputs (t1) after ld2 de-interleaving
+ out0 .req v2 // byte 0 of each 3-byte output group
+ out1 .req v3 // byte 1 of each 3-byte output group
+ out2 .req v4 // byte 2 of each 3-byte output group
+ tmp .req v5 // low bytes of t1
+
+ dst .req x0 // output byte pointer, post-incremented by st3
+ src .req x1 // input pointer, post-incremented by ld2
+ count .req x2 // remaining-pass counter
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt):
+
+ mov count, #16 // 16 passes; each consumes 32 input bytes and emits 24 output bytes
+poly_tobytes_asm_opt_asm_loop_start:
+ ld2 {data0.8h, data1.8h}, [src], #32 // load 16 halfwords, split into even (t0) and odd (t1) lanes
+
+ // r[3 * i + 0] = (t0 >> 0);
+ xtn out0.8b, data0.8h // keep the low byte of each t0
+
+ // r[3 * i + 1] = (t0 >> 8);
+ shrn out1.8b, data0.8h, #8 // keep the high byte of each t0
+ xtn tmp.8b, data1.8h // keep the low byte of each t1
+ // r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
+ sli out1.8b, tmp.8b, #4 // inserts (tmp << 4) above the low 4 bits of out1, which are preserved
+
+ // r[3 * i + 2] = (t1 >> 4);
+ shrn out2.8b, data1.8h, #4 // bits 4..11 of each t1
+
+ st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 // interleave the three byte vectors and store 24 bytes
+
+ subs count, count, #1 // decrement; subs also updates NZCV, which cbnz does not read
+ cbnz count, poly_tobytes_asm_opt_asm_loop_start
+ ret
+
+ .unreq data0
+ .unreq data1
+ .unreq out0
+ .unreq out1
+ .unreq out2
+ .unreq tmp
+ .unreq dst
+ .unreq src
+ .unreq count
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tomont_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tomont_asm_opt.S
new file mode 100644
index 000000000..bcbff9adb
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tomont_asm_opt.S
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/* Montgomery multiplication, with precomputed Montgomery twist
+ * Expects modulus in modulus.h[0]; clobbers tmp0. */
+.macro mulmod dst, src, const, const_twisted
+ sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h // tmp0 = rounded high half of the doubled product src * const_twisted
+ mul \dst\().8h, \src\().8h, \const\().8h // dst = low 16 bits of src * const
+ mls \dst\().8h, tmp0.8h, modulus.h[0] // dst -= tmp0 * modulus
+.endm
+
+/**********************************
+ * poly_tomont() *
+ **********************************/
+
+ src .req x0 // data pointer; the code below loads from and stores back to it
+ count .req x1 // remaining-pass counter
+ wtmp .req w2 // scratch for materializing constants
+
+ data .req v0
+ q_data .req q0
+ res .req v1
+ q_res .req q1
+
+ factor .req v2 // -1044 in every 16-bit lane
+ factor_t .req v3 // -10276 in every 16-bit lane
+ modulus .req v4 // 3329 in every 16-bit lane
+ modulus_twisted .req v5
+
+ tmp0 .req v6
+
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
+
+ mov wtmp, #3329 // ML-KEM modulus
+ dup modulus.8h, wtmp // broadcast to all eight 16-bit lanes
+
+ mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+ dup modulus_twisted.8h, wtmp // NOTE(review): never read in this file; v5 is overwritten as scratch by the loop body
+
+ mov wtmp, #-1044 // 2^16 mod 3329, as the signed representative 2285 - 3329
+ dup factor.8h, wtmp // broadcast to all eight 16-bit lanes
+
+ mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+ dup factor_t.8h, wtmp // broadcast to all eight 16-bit lanes
+
+ mov count, #8 // x0 is post-incremented by 64 bytes eight times in total: seven loop passes plus the tail
+ // Instructions: 5
+ // Expected cycles: 7
+ // Expected IPC: 0.71
+ //
+ // Cycle bound: 7.0
+ // IPC bound: 0.71
+ //
+ // Wall time: 0.01s
+ // User time: 0.01s
+ //
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q26, [x0, #48] // *.............................
+ ldr q23, [x0, #16] // ..*...........................
+ mul v17.8H, v26.8H, v2.8H // ....*.........................
+ sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................
+ ldr q27, [x0, #32] // ......*.......................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q7, [x0, #48] // *..............................
+ // ldr q23, [x0, #16] // ..*............................
+ // mul v17.8H, v7.8H, v2.8H // ....*..........................
+ // sqrdmulh v7.8H, v7.8H, v3.8H // .....*.........................
+ // ldr q27, [x0, #32] // ......*........................
+
+ sub count, count, #1
+poly_tomont_asm_opt_loop:
+ // Instructions: 20
+ // Expected cycles: 24
+ // Expected IPC: 0.83
+ //
+ // Cycle bound: 24.0
+ // IPC bound: 0.83
+ //
+ // Wall time: 0.73s
+ // User time: 0.73s
+ //
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ mls v17.8H, v7.8H, v4.H[0] // *.............................
+ sqrdmulh v5.8H, v23.8H, v3.8H // .*............................
+ ldr q7, [x0], #64 // ..*...........................
+ str q17, [x0, #-16] // ....*.........................
+ sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................
+ sqrdmulh v19.8H, v7.8H, v3.8H // ......*.......................
+ mul v25.8H, v23.8H, v2.8H // .......*......................
+ mul v0.8H, v7.8H, v2.8H // ........*.....................
+ mul v26.8H, v27.8H, v2.8H // .........*....................
+ ldr q7, [x0, #48] // ..........e...................
+ mls v25.8H, v5.8H, v4.H[0] // ............*.................
+ ldr q23, [x0, #16] // .............e................
+ mls v26.8H, v29.8H, v4.H[0] // ...............*..............
+ mls v0.8H, v19.8H, v4.H[0] // ................*.............
+ str q25, [x0, #-48] // .................*............
+ mul v17.8H, v7.8H, v2.8H // ..................e...........
+ sqrdmulh v7.8H, v7.8H, v3.8H // ...................e..........
+ str q0, [x0, #-64] // ....................*.........
+ ldr q27, [x0, #32] // .....................e........
+ str q26, [x0, #-32] // .......................*......
+
+ // --------- cycle (expected) ---------->
+ // 0 25
+ // |------------------------|------------
+ // ldr q0, [x0], #64 // ..............'.*.....................
+ // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*.................
+ // mul v1.8h, v0.8h, v2.8h // ..............'.......*...............
+ // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*.......
+ // str q1, [x0, #-64] // ..........~...'...................*...
+ // ldr q0, [x0, #-48] // ...e..........'............~..........
+ // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*......................
+ // mul v1.8h, v0.8h, v2.8h // ..............'......*................
+ // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*...........
+ // str q1, [x0, #-48] // .......~......'................*......
+ // ldr q0, [x0, #-32] // ...........e..'....................~..
+ // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*..................
+ // mul v1.8h, v0.8h, v2.8h // ..............'........*..............
+ // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........
+ // str q1, [x0, #-32] // .............~'......................*
+ // ldr q0, [x0, #-16] // e.............'.........~.............
+ // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~....
+ // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~.....
+ // mls v1.8h, v6.8h, v4.h[0] // ..............*.......................
+ // str q1, [x0, #-16] // ..............'...*...................
+
+ sub count, count, 1
+ cbnz count, poly_tomont_asm_opt_loop
+ // Instructions: 15
+ // Expected cycles: 18
+ // Expected IPC: 0.83
+ //
+ // Cycle bound: 18.0
+ // IPC bound: 0.83
+ //
+ // Wall time: 0.07s
+ // User time: 0.07s
+ //
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ mls v17.8H, v7.8H, v4.H[0] // *.............................
+ sqrdmulh v7.8H, v23.8H, v3.8H // .*............................
+ mul v26.8H, v23.8H, v2.8H // ..*...........................
+ sqrdmulh v25.8H, v27.8H, v3.8H // ...*..........................
+ ldr q23, [x0], #64 // ....*.........................
+ mul v27.8H, v27.8H, v2.8H // ......*.......................
+ mls v26.8H, v7.8H, v4.H[0] // .......*......................
+ sqrdmulh v7.8H, v23.8H, v3.8H // ........*.....................
+ mul v23.8H, v23.8H, v2.8H // .........*....................
+ str q17, [x0, #-16] // ..........*...................
+ mls v27.8H, v25.8H, v4.H[0] // ...........*..................
+ str q26, [x0, #-48] // ............*.................
+ mls v23.8H, v7.8H, v4.H[0] // .............*................
+ str q27, [x0, #-32] // ...............*..............
+ str q23, [x0, #-64] // .................*............
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // mls v17.8H, v7.8H, v4.H[0] // *..............................
+ // sqrdmulh v5.8H, v23.8H, v3.8H // .*.............................
+ // ldr q7, [x0], #64 // ....*..........................
+ // str q17, [x0, #-16] // ..........*....................
+ // sqrdmulh v29.8H, v27.8H, v3.8H // ...*...........................
+ // sqrdmulh v19.8H, v7.8H, v3.8H // ........*......................
+ // mul v25.8H, v23.8H, v2.8H // ..*............................
+ // mul v0.8H, v7.8H, v2.8H // .........*.....................
+ // mul v26.8H, v27.8H, v2.8H // ......*........................
+ // mls v25.8H, v5.8H, v4.H[0] // .......*.......................
+ // mls v26.8H, v29.8H, v4.H[0] // ...........*...................
+ // mls v0.8H, v19.8H, v4.H[0] // .............*.................
+ // str q25, [x0, #-48] // ............*..................
+ // str q0, [x0, #-64] // .................*.............
+ // str q26, [x0, #-32] // ...............*...............
+
+
+ ret
+
+ .unreq src
+ .unreq count
+ .unreq wtmp
+
+ .unreq data
+ .unreq q_data
+ .unreq res
+ .unreq q_res
+
+ .unreq factor
+ .unreq factor_t
+ .unreq modulus
+ .unreq modulus_twisted
+
+ .unreq tmp0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S
new file mode 100644
index 000000000..e336b92cb
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S
@@ -0,0 +1,546 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// AArch64 re-implementation of the asymmetric base multiplication from:
+
+// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+// https://eprint.iacr.org/2021/986
+// https://github.com/neon-ntt/neon-ntt
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 2
+/* simpasm: header-end */
+
+// Input:
+// - Vectors al, ah of 32-bit entries (a is the register-name prefix)
+// Output:
+// - Montgomery reductions of al || ah as 16-bit entries, stored in x (al, ah, t0 are clobbered)
+.macro montgomery_reduce_long x, a
+ uzp1 t0.8h, \a\()l.8h, \a\()h.8h // t0 = low 16 bits of each 32-bit entry
+ mul t0.8h, t0.8h, modulus_twisted.8h // t0 = t0 * modulus_twisted, truncated to 16 bits
+ smlal \a\()l.4s, t0.4h, modulus.4h // lower four entries += t0 * modulus (widening)
+ smlal2 \a\()h.4s, t0.8h, modulus.8h // upper four entries += t0 * modulus (widening)
+ uzp2 \x\().8h, \a\()l.8h, \a\()h.8h // x = high 16 bits of each updated 32-bit entry
+.endm
+
+// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+// The first sum is written to d0l/d0h, the second to d1l/d1h.
+// Bounds:
+// - Assume |a| < 4096,
+// - Result: < 2*4096*2^15 = 2^28
+.macro pmull d, a, b
+ smull \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0l = a0 * b0 (lower four lanes)
+ smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h = a0 * b0 (upper four lanes)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t
+
+ smull \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l = a0 * b1 (lower four lanes)
+ smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h = a0 * b1 (upper four lanes)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0
+.endm
+
+.macro pmlal d, a, b
+ smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0l += a0 * b0 (lower four lanes)
+ smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h += a0 * b0 (upper four lanes)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t
+ // Same sums as pmull, but accumulated onto the existing d0/d1 contents.
+ smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l += a0 * b1 (lower four lanes)
+ smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h += a0 * b1 (upper four lanes)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0
+.endm
+
+.macro ld2_wrap a, ptr
+ ldr q_tmp0, [\ptr\()], #32 // load first 16 bytes, then advance ptr by 32
+ ldr q_tmp1, [\ptr\(), #-16] // load the second 16 bytes of the same 32-byte chunk
+ uzp1 \a\()0.8h, tmp0.8h, tmp1.8h // a0 = even-indexed 16-bit entries of the chunk
+ uzp2 \a\()1.8h, tmp0.8h, tmp1.8h // a1 = odd-indexed 16-bit entries of the chunk
+.endm
+
+.macro st2_wrap a, ptr
+ zip1 tmp0.8h, \a\()0.8h, \a\()1.8h // tmp0 = interleaved lower halves of a0 and a1
+ zip2 tmp1.8h, \a\()0.8h, \a\()1.8h // tmp1 = interleaved upper halves of a0 and a1
+ str q_tmp0, [\ptr\()], #32 // store first 16 bytes, then advance ptr by 32
+ str q_tmp1, [\ptr\(), #-16] // store the second 16 bytes of the same 32-byte chunk
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
+ ld2_wrap \a\(), \a_ptr // a0/a1 = de-interleaved 32 bytes from a_ptr (advanced by 32)
+ ld2_wrap \b\(), \b_ptr // b0/b1 = de-interleaved 32 bytes from b_ptr (advanced by 32)
+ ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 // b1t = 16 bytes from b_cache_ptr (advanced by 16)
+.endm
+
+.macro save_vregs
+ sub sp, sp, #(16*4) // reserve 64 bytes of stack for d8-d15
+ stp d8, d9, [sp, #16*0] // spill the low 64 bits of v8-v15, which this routine overwrites
+ stp d10, d11, [sp, #16*1]
+ stp d12, d13, [sp, #16*2]
+ stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+ ldp d8, d9, [sp, #16*0] // reload d8-d15 from the slots written by save_vregs
+ ldp d10, d11, [sp, #16*1]
+ ldp d12, d13, [sp, #16*2]
+ ldp d14, d15, [sp, #16*3]
+ add sp, sp, #(16*4) // release the 64-byte save area
+.endm
+
+.macro push_stack
+ save_vregs // prologue: only d8-d15 are saved
+.endm
+
+.macro pop_stack
+ restore_vregs // epilogue: undoes push_stack
+.endm
+
+ out .req x0 // destination pointer; results are stored through x0
+ a0_ptr .req x1 // base of the first a polynomial
+ b0_ptr .req x2 // base of the first b polynomial
+ b0_cache_ptr .req x3 // base of the first b cache
+ a1_ptr .req x4 // set to a0_ptr + 512 by the routine
+ b1_ptr .req x5 // set to b0_ptr + 512 by the routine
+ b1_cache_ptr .req x6 // set to b0_cache_ptr + 256 by the routine
+ a2_ptr .req x7
+ b2_ptr .req x8
+ b2_cache_ptr .req x9
+ a3_ptr .req x10
+ b3_ptr .req x11
+ b3_cache_ptr .req x12
+ count .req x13 // loop iteration counter
+ wtmp .req w14 // scratch for materialising 16-bit constants
+
+ modulus .req v0 // every 16-bit lane holds 3329
+ modulus_twisted .req v2 // every 16-bit lane holds 3327 = -3329^(-1) mod 2^16
+
+ aa0 .req v3
+ aa1 .req v4
+ bb0 .req v5
+ bb1 .req v6
+ bb1t .req v7
+
+ res0l .req v8
+ res1l .req v9
+ res0h .req v10
+ res1h .req v11
+
+ tmp0 .req v12 // scratch used by ld2_wrap / st2_wrap
+ tmp1 .req v13 // scratch used by ld2_wrap / st2_wrap
+ q_tmp0 .req q12 // 128-bit view of tmp0 for ldr/str
+ q_tmp1 .req q13 // 128-bit view of tmp1 for ldr/str
+
+ out0 .req v26
+ out1 .req v27
+
+ t0 .req v28 // scratch used by montgomery_reduce_long
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt):
+ push_stack
+
+ mov wtmp, #3329
+ dup modulus.8h, wtmp
+
+ mov wtmp, #3327
+ dup modulus_twisted.8h, wtmp
+
+ // Computed bases of vector entries
+
+ add a1_ptr, a0_ptr, #(1 * 512)
+ add b1_ptr, b0_ptr, #(1 * 512)
+ add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
+
+ mov count, #(MLKEM_N / 16)
+ // Instructions: 75
+ // Expected cycles: 94
+ // Expected IPC: 0.80
+
+ // Cycle bound: 94.0
+ // IPC bound: 0.80
+
+ // Wall time: 1.49s
+ // User time: 1.49s
+
+ // --------------------------- original position ---------------------------->
+ // 0 25 50
+ // |------------------------|------------------------|
+ ldr q9, [x4], #32 // *..........................................................................
+ ldr q5, [x4, #-16] // ......*....................................................................
+ ldr q11, [x5], #32 // .*.........................................................................
+ uzp1 v23.8H, v9.8H, v5.8H // .........*.................................................................
+ uzp2 v9.8H, v9.8H, v5.8H // .....................*.....................................................
+ ldr q5, [x2], #32 // ..*........................................................................
+ ldr q7, [x5, #-16] // ..............*............................................................
+ ldr q21, [x2, #-16] // ...*.......................................................................
+ uzp2 v10.8H, v11.8H, v7.8H // .................*.........................................................
+ uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................
+ uzp1 v7.8H, v5.8H, v21.8H // ....*......................................................................
+ uzp2 v5.8H, v5.8H, v21.8H // .....*.....................................................................
+ ldr q21, [x1], #32 // .......*...................................................................
+ ldr q25, [x1, #-16] // ........*..................................................................
+ ld1 {v6.8H}, [x3], #16 // ............................*..............................................
+ uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................
+ uzp2 v21.8H, v21.8H, v25.8H // ...........*...............................................................
+ smull v25.4S, v26.4H, v5.4H // ............*..............................................................
+ smull2 v5.4S, v26.8H, v5.8H // .............*.............................................................
+ smull v19.4S, v26.4H, v7.4H // ..........................*................................................
+ smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................
+ smlal v25.4S, v21.4H, v7.4H // ...............*...........................................................
+ smlal2 v5.4S, v21.8H, v7.8H // ................*..........................................................
+ smlal v19.4S, v21.4H, v6.4H // ...................................*.......................................
+ smlal2 v26.4S, v21.8H, v6.8H // .................................*.........................................
+ smlal v25.4S, v23.4H, v10.4H // ...................*.......................................................
+ smlal2 v5.4S, v23.8H, v10.8H // ....................*......................................................
+ smlal v19.4S, v23.4H, v11.4H // ......................................*....................................
+ smlal2 v26.4S, v23.8H, v11.8H // ....................................*......................................
+ ld1 {v23.8H}, [x6], #16 // ........................*..................................................
+ smlal v25.4S, v9.4H, v11.4H // ......................*....................................................
+ smlal2 v5.4S, v9.8H, v11.8H // .......................*...................................................
+ smlal2 v26.4S, v9.8H, v23.8H // .......................................*...................................
+ smlal v19.4S, v9.4H, v23.4H // .........................................*.................................
+ ldr q9, [x4], #32 // ...............................*...........................................
+ uzp1 v11.8H, v25.8H, v5.8H // .........................*.................................................
+ uzp1 v23.8H, v19.8H, v26.8H // .............................................*.............................
+ mul v11.8H, v11.8H, v2.8H // ...........................*...............................................
+ mul v23.8H, v23.8H, v2.8H // ..............................................*............................
+ ldr q7, [x5], #32 // ................................*..........................................
+ smlal2 v5.4S, v11.8H, v0.8H // .............................*.............................................
+ smlal v25.4S, v11.4H, v0.4H // ..................................*........................................
+ ldr q11, [x2], #32 // .....................................*.....................................
+ ldr q21, [x2, #-16] // ........................................*..................................
+ ldr q6, [x4, #-16] // ...............................................*...........................
+ uzp1 v17.8H, v11.8H, v21.8H // ...........................................*...............................
+ ldr q10, [x1], #32 // ................................................*..........................
+ ldr q29, [x1, #-16] // .................................................*.........................
+ uzp2 v11.8H, v11.8H, v21.8H // ............................................*..............................
+ uzp1 v13.8H, v9.8H, v6.8H // ...................................................*.......................
+ uzp1 v3.8H, v10.8H, v29.8H // ....................................................*......................
+ uzp2 v10.8H, v10.8H, v29.8H // .....................................................*.....................
+ smull v12.4S, v3.4H, v11.4H // ......................................................*....................
+ smull2 v11.4S, v3.8H, v11.8H // .......................................................*...................
+ ldr q21, [x5, #-16] // ........................................................*..................
+ smlal v12.4S, v10.4H, v17.4H // .........................................................*.................
+ smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................
+ uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*...............
+ uzp1 v15.8H, v7.8H, v21.8H // ............................................................*..............
+ smlal v12.4S, v13.4H, v29.4H // .............................................................*.............
+ smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............
+ uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*...........
+ smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................
+ smlal v12.4S, v28.4H, v15.4H // .................................................................*.........
+ smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........
+ smlal v19.4S, v23.4H, v0.4H // ................................................................*..........
+ uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................
+ smull v23.4S, v3.4H, v17.4H // ......................................................................*....
+ uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*.....
+ uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*......
+ mul v14.8H, v9.8H, v2.8H // .......................................................................*...
+ ld1 {v22.8H}, [x6], #16 // ...................................................................*.......
+ zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
+ smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................*
+ ld1 {v4.8H}, [x3], #16 // .........................................................................*.
+
+ // ------------------------------ new position ------------------------------>
+ // 0 25 50
+ // |------------------------|------------------------|------------------------
+ // ldr q18, [x4], #32 // *..........................................................................
+ // ldr q30, [x5], #32 // ..*........................................................................
+ // ldr q8, [x2], #32 // .....*.....................................................................
+ // ldr q9, [x2, #-16] // .......*...................................................................
+ // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................
+ // uzp2 v4.8H, v8.8H, v9.8H // ...........*...............................................................
+ // ldr q19, [x4, #-16] // .*.........................................................................
+ // ldr q29, [x1], #32 // ............*..............................................................
+ // ldr q12, [x1, #-16] // .............*.............................................................
+ // uzp1 v13.8H, v18.8H, v19.8H // ...*.......................................................................
+ // uzp1 v3.8H, v29.8H, v12.8H // ...............*...........................................................
+ // uzp2 v10.8H, v29.8H, v12.8H // ................*..........................................................
+ // smull v12.4S, v3.4H, v4.4H // .................*.........................................................
+ // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................
+ // ldr q5, [x5, #-16] // ......*....................................................................
+ // smlal v12.4S, v10.4H, v17.4H // .....................*.....................................................
+ // smlal2 v11.4S, v10.8H, v17.8H // ......................*....................................................
+ // uzp2 v14.8H, v30.8H, v5.8H // ........*..................................................................
+ // uzp1 v15.8H, v30.8H, v5.8H // .........*.................................................................
+ // smlal v12.4S, v13.4H, v14.4H // .........................*.................................................
+ // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................
+ // uzp2 v28.8H, v18.8H, v19.8H // ....*......................................................................
+ // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................
+ // smlal2 v11.4S, v28.8H, v15.8H // ...............................*...........................................
+ // ld1 {v22.8H}, [x6], #16 // .............................*.............................................
+ // uzp1 v1.8H, v12.8H, v11.8H // ...................................*.......................................
+ // smull v23.4S, v3.4H, v17.4H // ...................*.......................................................
+ // mul v14.8H, v1.8H, v2.8H // .....................................*.....................................
+ // ld1 {v4.8H}, [x3], #16 // ..............*............................................................
+ // smlal2 v11.4S, v14.8H, v0.8H // ........................................*..................................
+ // smull2 v20.4S, v3.8H, v17.8H // ....................*......................................................
+ // ldr q18, [x4], #32 // ..................................*........................................
+ // ldr q30, [x5], #32 // .......................................*...................................
+ // smlal2 v20.4S, v10.8H, v4.8H // ........................*..................................................
+ // smlal v12.4S, v14.4H, v0.4H // .........................................*.................................
+ // smlal v23.4S, v10.4H, v4.4H // .......................*...................................................
+ // smlal2 v20.4S, v13.8H, v15.8H // ............................*..............................................
+ // ldr q8, [x2], #32 // ..........................................*................................
+ // smlal v23.4S, v13.4H, v15.4H // ...........................*...............................................
+ // smlal2 v20.4S, v28.8H, v22.8H // ................................*..........................................
+ // ldr q9, [x2, #-16] // ...........................................*...............................
+ // smlal v23.4S, v28.4H, v22.4H // .................................*.........................................
+ // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........
+ // uzp1 v17.8H, v8.8H, v9.8H // .............................................*.............................
+ // uzp2 v4.8H, v8.8H, v9.8H // ................................................*..........................
+ // uzp1 v5.8H, v23.8H, v20.8H // ....................................*......................................
+ // mul v31.8H, v5.8H, v2.8H // ......................................*....................................
+ // ldr q19, [x4, #-16] // ............................................*..............................
+ // ldr q29, [x1], #32 // ..............................................*............................
+ // ldr q12, [x1, #-16] // ...............................................*...........................
+ // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............
+ // uzp1 v13.8H, v18.8H, v19.8H // .................................................*.........................
+ // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................
+ // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*.......................
+ // smull v12.4S, v3.4H, v4.4H // ....................................................*......................
+ // smull2 v11.4S, v3.8H, v4.8H // .....................................................*.....................
+ // ldr q5, [x5, #-16] // ......................................................*....................
+ // smlal v12.4S, v10.4H, v17.4H // .......................................................*...................
+ // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*..................
+ // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*.................
+ // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................
+ // smlal v12.4S, v13.4H, v14.4H // ...........................................................*...............
+ // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*..............
+ // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*.............
+ // smlal v23.4S, v31.4H, v0.4H // .................................................................*.........
+ // smlal v12.4S, v28.4H, v15.4H // ...............................................................*...........
+ // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*..........
+ // ld1 {v22.8H}, [x6], #16 // .......................................................................*...
+ // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*.....
+ // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*......
+ // smull v23.4S, v3.4H, v17.4H // ...................................................................*.......
+ // mul v14.8H, v1.8H, v2.8H // ......................................................................*....
+ // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
+ // ld1 {v4.8H}, [x3], #16 // ..........................................................................*
+ // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*.
+
+ sub count, count, #2
+polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop:
+ // Instructions: 48
+ // Expected cycles: 58
+ // Expected IPC: 0.83
+
+ // Cycle bound: 58.0
+ // IPC bound: 0.83
+
+ // Wall time: 6.39s
+ // User time: 6.39s
+
+ // -------------- original position -------------->
+ // 0 25
+ // |------------------------|----------------------
+ smull2 v20.4S, v3.8H, v17.8H // ..........*.....................................
+ ldr q18, [x4], #32 // .................e..............................
+ ldr q30, [x5], #32 // .....................e..........................
+ smlal2 v20.4S, v10.8H, v4.8H // ............*...................................
+ smlal v12.4S, v14.4H, v0.4H // .........................................*......
+ smlal v23.4S, v10.4H, v4.4H // ...........*....................................
+ str q9, [x0, #16] // ...............................................l
+ smlal2 v20.4S, v13.8H, v15.8H // ...........................*....................
+ ldr q8, [x2], #32 // ....e...........................................
+ smlal v23.4S, v13.4H, v15.4H // ..........................*.....................
+ smlal2 v20.4S, v28.8H, v22.8H // .............................*..................
+ zip1 v26.8H, v19.8H, v27.8H // ............................................l...
+ ldr q9, [x2, #-16] // .....e..........................................
+ smlal v23.4S, v28.4H, v22.4H // ............................*...................
+ uzp2 v27.8H, v12.8H, v11.8H // ...........................................*....
+ uzp1 v17.8H, v8.8H, v9.8H // ......e.........................................
+ uzp2 v4.8H, v8.8H, v9.8H // .......e........................................
+ uzp1 v5.8H, v23.8H, v20.8H // ..................................*.............
+ str q26, [x0], #32 // ..............................................l.
+ mul v31.8H, v5.8H, v2.8H // ...................................*............
+ ldr q19, [x4, #-16] // ..................e.............................
+ ldr q29, [x1], #32 // e...............................................
+ ldr q12, [x1, #-16] // .e..............................................
+ smlal2 v20.4S, v31.8H, v0.8H // .....................................*..........
+ uzp1 v13.8H, v18.8H, v19.8H // ...................e............................
+ uzp1 v3.8H, v29.8H, v12.8H // ..e.............................................
+ uzp2 v10.8H, v29.8H, v12.8H // ...e............................................
+ smull v12.4S, v3.4H, v4.4H // .............e..................................
+ smull2 v11.4S, v3.8H, v4.8H // ..............e.................................
+ ldr q5, [x5, #-16] // ......................e.........................
+ smlal v12.4S, v10.4H, v17.4H // ...............e................................
+ smlal2 v11.4S, v10.8H, v17.8H // ................e...............................
+ uzp2 v14.8H, v30.8H, v5.8H // ........................e.......................
+ uzp1 v15.8H, v30.8H, v5.8H // .......................e........................
+ smlal v12.4S, v13.4H, v14.4H // ..............................e.................
+ smlal2 v11.4S, v13.8H, v14.8H // ...............................e................
+ uzp2 v28.8H, v18.8H, v19.8H // ....................e...........................
+ smlal v23.4S, v31.4H, v0.4H // ....................................*...........
+ smlal v12.4S, v28.4H, v15.4H // ................................e...............
+ smlal2 v11.4S, v28.8H, v15.8H // .................................e..............
+ ld1 {v22.8H}, [x6], #16 // .........................e......................
+ uzp2 v19.8H, v23.8H, v20.8H // ......................................*.........
+ uzp1 v1.8H, v12.8H, v11.8H // .......................................e........
+ smull v23.4S, v3.4H, v17.4H // .........e......................................
+ mul v14.8H, v1.8H, v2.8H // ........................................e.......
+ zip2 v9.8H, v19.8H, v27.8H // .............................................*..
+ ld1 {v4.8H}, [x3], #16 // ........e.......................................
+ smlal2 v11.4S, v14.8H, v0.8H // ..........................................e.....
+
+ // ------------------------------------------------- new position -------------------------------------------------->
+ // 0 25 50 75 100
+ // |------------------------|------------------------|------------------------|------------------------|-------------
+ // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'..................
+ // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'..................
+ // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'..................
+ // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'..................
+ // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~..........
+ // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~......
+ // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~...
+ // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~..
+ // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'..................
+ // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'..................
+ // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~..................
+ // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~.............
+ // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~...............
+ // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'..................
+ // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'..................
+ // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'..................
+ // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'..................
+ // ldr q12, [x4], #32 // e..............................................'~..............................................'~.................
+ // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'..................
+ // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'..................
+ // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'..................
+ // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................
+ // ldr q13, [x5, #-16] // ............................e..................'............................~..................'..................
+ // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'..................
+ // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'..................
+ // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'..................
+ // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~.........
+ // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~...........
+ // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~.....
+ // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........
+ // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'..................
+ // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'..................
+ // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'..................
+ // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'..................
+ // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~.
+ // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'..................
+ // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'..................
+ // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'..................
+ // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'..................
+ // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'..................
+ // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'..................
+ // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~..............
+ // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'..................
+ // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~....
+ // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l.......
+ // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'..................
+ // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l
+ // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............
+
+ sub count, count, #1
+ cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop
+ // Instructions: 21
+ // Expected cycles: 35
+ // Expected IPC: 0.60
+
+ // Cycle bound: 35.0
+ // IPC bound: 0.60
+
+ // Wall time: 0.08s
+ // User time: 0.08s
+
+ // ----- original position ----->
+ // 0 25
+ // |------------------------|----
+ smull2 v5.4S, v3.8H, v17.8H // *.............................
+ smlal v12.4S, v14.4H, v0.4H // ..*...........................
+ smlal v23.4S, v10.4H, v4.4H // ...*..........................
+ str q9, [x0, #16] // ....*.........................
+ smlal2 v5.4S, v10.8H, v4.8H // .*............................
+ uzp2 v11.8H, v12.8H, v11.8H // ..........*...................
+ zip1 v9.8H, v19.8H, v27.8H // ........*.....................
+ smlal v23.4S, v13.4H, v15.4H // ......*.......................
+ smlal2 v5.4S, v13.8H, v15.8H // .....*........................
+ str q9, [x0], #32 // ............*.................
+ smlal v23.4S, v28.4H, v22.4H // .........*....................
+ smlal2 v5.4S, v28.8H, v22.8H // .......*......................
+ uzp1 v9.8H, v23.8H, v5.8H // ...........*..................
+ mul v9.8H, v9.8H, v2.8H // .............*................
+ smlal2 v5.4S, v9.8H, v0.8H // ..............*...............
+ smlal v23.4S, v9.4H, v0.4H // ...............*..............
+ uzp2 v9.8H, v23.8H, v5.8H // ................*.............
+ zip2 v5.8H, v9.8H, v11.8H // .................*............
+ zip1 v9.8H, v9.8H, v11.8H // ...................*..........
+ str q5, [x0, #16] // ..................*...........
+ str q9, [x0], #32 // ....................*.........
+
+ // -------- new position -------->
+ // 0 25
+ // |------------------------|-----
+ // smull2 v20.4S, v3.8H, v17.8H // *..............................
+ // smlal2 v20.4S, v10.8H, v4.8H // ....*..........................
+ // smlal v12.4S, v14.4H, v0.4H // .*.............................
+ // smlal v23.4S, v10.4H, v4.4H // ..*............................
+ // str q9, [x0, #16] // ...*...........................
+ // smlal2 v20.4S, v13.8H, v15.8H // ........*......................
+ // smlal v23.4S, v13.4H, v15.4H // .......*.......................
+ // smlal2 v20.4S, v28.8H, v22.8H // ...........*...................
+ // zip1 v26.8H, v19.8H, v27.8H // ......*........................
+ // smlal v23.4S, v28.4H, v22.4H // ..........*....................
+ // uzp2 v27.8H, v12.8H, v11.8H // .....*.........................
+ // uzp1 v5.8H, v23.8H, v20.8H // ............*..................
+ // str q26, [x0], #32 // .........*.....................
+ // mul v31.8H, v5.8H, v2.8H // .............*.................
+ // smlal2 v20.4S, v31.8H, v0.8H // ..............*................
+ // smlal v23.4S, v31.4H, v0.4H // ...............*...............
+ // uzp2 v19.8H, v23.8H, v20.8H // ................*..............
+ // zip2 v9.8H, v19.8H, v27.8H // .................*.............
+ // str q9, [x0, #16] // ...................*...........
+ // zip1 v26.8H, v19.8H, v27.8H // ..................*............
+ // str q26, [x0], #32 // ....................*..........
+
+
+ pop_stack
+ ret
+
+/****************** REGISTER DEALLOCATIONS *******************/
+ .unreq out
+ .unreq a0_ptr
+ .unreq b0_ptr
+ .unreq b0_cache_ptr
+ .unreq a1_ptr
+ .unreq b1_ptr
+ .unreq b1_cache_ptr
+ .unreq a2_ptr
+ .unreq b2_ptr
+ .unreq b2_cache_ptr
+ .unreq a3_ptr
+ .unreq b3_ptr
+ .unreq b3_cache_ptr
+ .unreq count
+ .unreq modulus
+ .unreq modulus_twisted
+ .unreq wtmp
+ .unreq aa0
+ .unreq aa1
+ .unreq bb0
+ .unreq bb1
+ .unreq bb1t
+ .unreq res0l
+ .unreq res1l
+ .unreq res0h
+ .unreq res1h
+ .unreq tmp0
+ .unreq tmp1
+ .unreq q_tmp0
+ .unreq q_tmp1
+ .unreq out0
+ .unreq out1
+ .unreq t0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S
new file mode 100644
index 000000000..1c30ed6aa
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// AArch64 re-implementation of the asymmetric base multiplication from:
+
+// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+// https://eprint.iacr.org/2021/986
+// https://github.com/neon-ntt/neon-ntt
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 3
+/* simpasm: header-end */
+
+// Input:
+// - Vectors al, ah of 32-bit entries (both are overwritten)
+// Output:
+// - Montgomery reductions of al || ah, stored in x as 16-bit entries
+.macro montgomery_reduce_long x, a
+ uzp1 t0.8h, \a\()l.8h, \a\()h.8h // t0 = low 16 bits of each 32-bit entry of al || ah
+ mul t0.8h, t0.8h, modulus_twisted.8h // t0 = t0 * modulus_twisted, truncated to 16 bits
+ smlal \a\()l.4s, t0.4h, modulus.4h // al += t0 * modulus (lower four lanes of t0)
+ smlal2 \a\()h.4s, t0.8h, modulus.8h // ah += t0 * modulus (upper four lanes of t0)
+ uzp2 \x\().8h, \a\()l.8h, \a\()h.8h // x = high 16 bits of each 32-bit entry of al || ah
+.endm
+
+// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+
+// Bounds:
+// - Assume |a| < 4096,
+// - Result: < 2*4096*2^15 = 2^28
+.macro pmull d, a, b
+ smull \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0l = a0 * b0 (lower four lanes)
+ smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h = a0 * b0 (upper four lanes)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t (lower four lanes)
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t (upper four lanes)
+
+ smull \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l = a0 * b1 (lower four lanes)
+ smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h = a0 * b1 (upper four lanes)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0 (lower four lanes)
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0 (upper four lanes)
+.endm
+
+.macro pmlal d, a, b
+ smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0l += a0 * b0 (lower four lanes); accumulating variant of pmull
+ smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h += a0 * b0 (upper four lanes)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t (lower four lanes)
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t (upper four lanes)
+
+ smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l += a0 * b1 (lower four lanes)
+ smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h += a0 * b1 (upper four lanes)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0 (lower four lanes)
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0 (upper four lanes)
+.endm
+
+.macro ld2_wrap a, ptr
+ ldr q_tmp0, [\ptr\()], #32 // tmp0 = 16 bytes at ptr, then ptr += 32 (post-index)
+ ldr q_tmp1, [\ptr\(), #-16] // tmp1 = the following 16 bytes (ptr is already advanced)
+ uzp1 \a\()0.8h, tmp0.8h, tmp1.8h // a0 = even-indexed 16-bit entries of tmp0 || tmp1
+ uzp2 \a\()1.8h, tmp0.8h, tmp1.8h // a1 = odd-indexed 16-bit entries of tmp0 || tmp1
+.endm
+
+.macro st2_wrap a, ptr
+ zip1 tmp0.8h, \a\()0.8h, \a\()1.8h // tmp0 = lower four entries of a0 and a1, interleaved
+ zip2 tmp1.8h, \a\()0.8h, \a\()1.8h // tmp1 = upper four entries of a0 and a1, interleaved
+ str q_tmp0, [\ptr\()], #32 // store tmp0 at ptr, then ptr += 32 (post-index)
+ str q_tmp1, [\ptr\(), #-16] // store tmp1 in the following 16 bytes (ptr is already advanced)
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
+ ld2_wrap \a\(), \a_ptr // a0/a1 = deinterleaved 32 bytes from a_ptr; a_ptr += 32
+ ld2_wrap \b\(), \b_ptr // b0/b1 = deinterleaved 32 bytes from b_ptr; b_ptr += 32
+ ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 // b1t = 16 bytes from b_cache_ptr; b_cache_ptr += 16
+.endm
+
+.macro save_vregs
+ sub sp, sp, #(16*4) // reserve 64 bytes of stack for four register pairs
+ stp d8, d9, [sp, #16*0] // spill d8-d15, the low 64 bits of v8-v15 (callee-saved under AAPCS64)
+ stp d10, d11, [sp, #16*1]
+ stp d12, d13, [sp, #16*2]
+ stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+ ldp d8, d9, [sp, #16*0] // reload d8-d15 from the slots written by save_vregs
+ ldp d10, d11, [sp, #16*1]
+ ldp d12, d13, [sp, #16*2]
+ ldp d14, d15, [sp, #16*3]
+ add sp, sp, #(16*4) // release the 64-byte spill area
+.endm
+
+.macro push_stack
+ save_vregs // the only prologue work: spill d8-d15 to a 64-byte stack area
+.endm
+
+.macro pop_stack
+ restore_vregs // the only epilogue work: reload d8-d15 and release the stack area
+.endm
+
+ out .req x0 // output pointer; advanced by the post-indexed stores
+ a0_ptr .req x1 // base of the first operand; a1_ptr and a2_ptr are derived from it
+ b0_ptr .req x2 // base of the second operand; b1_ptr and b2_ptr are derived from it
+ b0_cache_ptr .req x3 // base of the cache operand; b1/b2 cache pointers are derived from it
+ a1_ptr .req x4 // set to a0_ptr + 1*512 in the routine prologue
+ b1_ptr .req x5 // set to b0_ptr + 1*512 in the routine prologue
+ b1_cache_ptr .req x6 // set to b0_cache_ptr + 1*512/2 in the routine prologue
+ a2_ptr .req x7 // set to a0_ptr + 2*512 in the routine prologue
+ b2_ptr .req x8 // set to b0_ptr + 2*512 in the routine prologue
+ b2_cache_ptr .req x9 // set to b0_cache_ptr + 2*512/2 in the routine prologue
+ a3_ptr .req x10 // NOTE(review): a3/b3 aliases are not initialized in the visible K == 3 prologue; presumably unused here, confirm
+ b3_ptr .req x11
+ b3_cache_ptr .req x12
+ count .req x13 // loop counter, initialized from MLKEM_N / 16
+ wtmp .req w14 // scratch used to materialize constants before dup
+
+ modulus .req v0 // 3329 broadcast to all eight 16-bit lanes
+ modulus_twisted .req v2 // 3327 broadcast to all eight 16-bit lanes; 3329 * 3327 = -1 (mod 2^16)
+
+ aa0 .req v3 // aa0/aa1: the 0/1 suffixes match what ld2_wrap, pmull and pmlal append to an operand prefix
+ aa1 .req v4
+ bb0 .req v5 // bb0/bb1/bb1t: the 0/1/1t suffixes match what load_polys, pmull and pmlal append
+ bb1 .req v6
+ bb1t .req v7
+
+ res0l .req v8 // res0l/res1l/res0h/res1h: 32-bit accumulators, suffixes as appended by pmull and pmlal
+ res1l .req v9
+ res0h .req v10
+ res1h .req v11
+
+ tmp0 .req v12 // scratch vectors used by ld2_wrap and st2_wrap
+ tmp1 .req v13
+ q_tmp0 .req q12 // q-register view of tmp0 (same register, v12) for ldr/str
+ q_tmp1 .req q13 // q-register view of tmp1 (same register, v13) for ldr/str
+
+ out0 .req v26 // out0/out1: the 0/1 suffixes match what st2_wrap appends to an operand prefix
+ out1 .req v27
+
+ t0 .req v28 // scratch vector used by montgomery_reduce_long
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt):
+ push_stack
+ mov wtmp, #3329
+ dup modulus.8h, wtmp
+
+ mov wtmp, #3327
+ dup modulus_twisted.8h, wtmp
+
+ // Computed bases of vector entries
+
+ add a1_ptr, a0_ptr, #(1 * 512)
+ add b1_ptr, b0_ptr, #(1 * 512)
+ add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
+ add a2_ptr, a0_ptr, #(2 * 512)
+ add b2_ptr, b0_ptr, #(2 * 512)
+ add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
+
+ mov count, #(MLKEM_N / 16)
+ // Instructions: 75
+ // Expected cycles: 103
+ // Expected IPC: 0.73
+
+ // Cycle bound: 103.0
+ // IPC bound: 0.73
+
+ // Wall time: 0.94s
+ // User time: 0.94s
+
+ // --------------------------- original position ---------------------------->
+ // 0 25 50
+ // |------------------------|------------------------|
+ ldr q7, [x2, #16] // *..........................................................................
+ ldr q20, [x2], #32 // ..*........................................................................
+ ldr q15, [x1, #16] // .*.........................................................................
+ uzp1 v8.8H, v20.8H, v7.8H // ...............*...........................................................
+ uzp2 v7.8H, v20.8H, v7.8H // ................*..........................................................
+ ld1 {v20.8H}, [x3], #16 // ...*.......................................................................
+ ldr q30, [x1], #32 // ..............*............................................................
+ ldr q11, [x4], #32 // ....*......................................................................
+ uzp1 v16.8H, v30.8H, v15.8H // .................*.........................................................
+ uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................
+ smull v30.4S, v16.4H, v7.4H // ...................*.......................................................
+ smull2 v7.4S, v16.8H, v7.8H // ....................*......................................................
+ smull v9.4S, v16.4H, v8.4H // .....................*.....................................................
+ smull2 v16.4S, v16.8H, v8.8H // ......................*....................................................
+ smlal v30.4S, v15.4H, v8.4H // .......................*...................................................
+ smlal2 v7.4S, v15.8H, v8.8H // ........................*..................................................
+ smlal v9.4S, v15.4H, v20.4H // .........................*.................................................
+ smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................
+ ldr q20, [x4, #-16] // .....*.....................................................................
+ ldr q15, [x5], #32 // ......*....................................................................
+ uzp1 v8.8H, v11.8H, v20.8H // ...........................*...............................................
+ uzp2 v20.8H, v11.8H, v20.8H // ............................*..............................................
+ ldr q11, [x5, #-16] // .......*...................................................................
+ ld1 {v27.8H}, [x6], #16 // ........*..................................................................
+ uzp1 v10.8H, v15.8H, v11.8H // .............................*.............................................
+ uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................
+ smlal v9.4S, v8.4H, v10.4H // ...............................*...........................................
+ smlal2 v16.4S, v8.8H, v10.8H // ................................*..........................................
+ smlal v30.4S, v8.4H, v15.4H // .................................*.........................................
+ smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................
+ smlal v9.4S, v20.4H, v27.4H // ...................................*.......................................
+ smlal2 v16.4S, v20.8H, v27.8H // ....................................*......................................
+ smlal v30.4S, v20.4H, v10.4H // .....................................*.....................................
+ smlal2 v7.4S, v20.8H, v10.8H // ......................................*....................................
+ ldr q20, [x7], #32 // .........*.................................................................
+ ldr q15, [x7, #-16] // ..........*................................................................
+ ldr q8, [x8], #32 // ...........*...............................................................
+ uzp1 v11.8H, v20.8H, v15.8H // .......................................*...................................
+ uzp2 v20.8H, v20.8H, v15.8H // ........................................*..................................
+ ldr q15, [x8, #-16] // ............*..............................................................
+ ld1 {v27.8H}, [x9], #16 // .............*.............................................................
+ uzp1 v10.8H, v8.8H, v15.8H // .........................................*.................................
+ uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................
+ smlal v9.4S, v11.4H, v10.4H // ...........................................*...............................
+ smlal2 v16.4S, v11.8H, v10.8H // ............................................*..............................
+ smlal v30.4S, v11.4H, v15.4H // .............................................*.............................
+ smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................
+ smlal v9.4S, v20.4H, v27.4H // ...............................................*...........................
+ smlal2 v16.4S, v20.8H, v27.8H // ................................................*..........................
+ smlal v30.4S, v20.4H, v10.4H // .................................................*.........................
+ smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................
+ ldr q15, [x2], #32 // ...............................................................*...........
+ uzp1 v20.8H, v9.8H, v16.8H // ....................................................*......................
+ uzp1 v8.8H, v30.8H, v7.8H // .....................................................*.....................
+ mul v20.8H, v20.8H, v2.8H // ......................................................*....................
+ mul v8.8H, v8.8H, v2.8H // .......................................................*...................
+ ldr q21, [x4], #32 // .................................................................*.........
+ smlal v9.4S, v20.4H, v0.4H // ........................................................*..................
+ smlal2 v16.4S, v20.8H, v0.8H // .........................................................*.................
+ smlal v30.4S, v8.4H, v0.4H // ..........................................................*................
+ smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*...............
+ ldr q6, [x4, #-16] // ..................................................................*........
+ uzp2 v27.8H, v9.8H, v16.8H // ............................................................*..............
+ uzp2 v10.8H, v30.8H, v7.8H // .............................................................*.............
+ ldr q16, [x2, #-16] // ...................................................*.......................
+ ldr q30, [x1, #16] // ..............................................................*............
+ ld1 {v9.8H}, [x3], #16 // ................................................................*..........
+ ldr q1, [x5], #32 // ...................................................................*.......
+ ldr q12, [x5, #-16] // ....................................................................*......
+ ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
+ ldr q19, [x7], #32 // ......................................................................*....
+ ldr q31, [x7, #-16] // .......................................................................*...
+ ldr q17, [x8], #32 // ........................................................................*..
+ ldr q18, [x8, #-16] // .........................................................................*.
+ ld1 {v25.8H}, [x9], #16 // ..........................................................................*
+
+ // ------------------------------ new position ------------------------------>
+ // 0 25 50
+ // |------------------------|------------------------|------------------------
+ // ldr q16, [x2, #16] // *..........................................................................
+ // ldr q30, [x1, #16] // ..*........................................................................
+ // ldr q15, [x2], #32 // .*.........................................................................
+ // ld1 {v9.8H}, [x3], #16 // .....*.....................................................................
+ // ldr q21, [x4], #32 // .......*...................................................................
+ // ldr q6, [x4, #-16] // ..................*........................................................
+ // ldr q1, [x5], #32 // ...................*.......................................................
+ // ldr q12, [x5, #-16] // ......................*....................................................
+ // ld1 {v24.8H}, [x6], #16 // .......................*...................................................
+ // ldr q19, [x7], #32 // ..................................*........................................
+ // ldr q31, [x7, #-16] // ...................................*.......................................
+ // ldr q17, [x8], #32 // ....................................*......................................
+ // ldr q18, [x8, #-16] // .......................................*...................................
+ // ld1 {v25.8H}, [x9], #16 // ........................................*..................................
+ // ldr q20, [x1], #32 // ......*....................................................................
+ // uzp1 v7.8H, v15.8H, v16.8H // ...*.......................................................................
+ // uzp2 v15.8H, v15.8H, v16.8H // ....*......................................................................
+ // uzp1 v8.8H, v20.8H, v30.8H // ........*..................................................................
+ // uzp2 v20.8H, v20.8H, v30.8H // .........*.................................................................
+ // smull v30.4S, v8.4H, v15.4H // ..........*................................................................
+ // smull2 v15.4S, v8.8H, v15.8H // ...........*...............................................................
+ // smull v11.4S, v8.4H, v7.4H // ............*..............................................................
+ // smull2 v8.4S, v8.8H, v7.8H // .............*.............................................................
+ // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................
+ // smlal2 v15.4S, v20.8H, v7.8H // ...............*...........................................................
+ // smlal v11.4S, v20.4H, v9.4H // ................*..........................................................
+ // smlal2 v8.4S, v20.8H, v9.8H // .................*.........................................................
+ // uzp1 v7.8H, v21.8H, v6.8H // ....................*......................................................
+ // uzp2 v20.8H, v21.8H, v6.8H // .....................*.....................................................
+ // uzp1 v16.8H, v1.8H, v12.8H // ........................*..................................................
+ // uzp2 v9.8H, v1.8H, v12.8H // .........................*.................................................
+ // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................
+ // smlal2 v8.4S, v7.8H, v16.8H // ...........................*...............................................
+ // smlal v30.4S, v7.4H, v9.4H // ............................*..............................................
+ // smlal2 v15.4S, v7.8H, v9.8H // .............................*.............................................
+ // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................
+ // smlal2 v8.4S, v20.8H, v24.8H // ...............................*...........................................
+ // smlal v30.4S, v20.4H, v16.4H // ................................*..........................................
+ // smlal2 v15.4S, v20.8H, v16.8H // .................................*.........................................
+ // uzp1 v7.8H, v19.8H, v31.8H // .....................................*.....................................
+ // uzp2 v20.8H, v19.8H, v31.8H // ......................................*....................................
+ // uzp1 v16.8H, v17.8H, v18.8H // .........................................*.................................
+ // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................
+ // smlal v11.4S, v7.4H, v16.4H // ...........................................*...............................
+ // smlal2 v8.4S, v7.8H, v16.8H // ............................................*..............................
+ // smlal v30.4S, v7.4H, v9.4H // .............................................*.............................
+ // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................
+ // smlal v11.4S, v20.4H, v25.4H // ...............................................*...........................
+ // smlal2 v8.4S, v20.8H, v25.8H // ................................................*..........................
+ // smlal v30.4S, v20.4H, v16.4H // .................................................*.........................
+ // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................
+ // ldr q16, [x2, #16] // ................................................................*..........
+ // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*......................
+ // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*.....................
+ // mul v7.8H, v7.8H, v2.8H // ......................................................*....................
+ // mul v20.8H, v20.8H, v2.8H // .......................................................*...................
+ // smlal v11.4S, v7.4H, v0.4H // .........................................................*.................
+ // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................
+ // smlal v30.4S, v20.4H, v0.4H // ...........................................................*...............
+ // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*..............
+ // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............
+ // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*...........
+ // ldr q30, [x1, #16] // .................................................................*.........
+ // ldr q15, [x2], #32 // ...................................................*.......................
+ // ld1 {v9.8H}, [x3], #16 // ..................................................................*........
+ // ldr q21, [x4], #32 // ........................................................*..................
+ // ldr q6, [x4, #-16] // .............................................................*.............
+ // ldr q1, [x5], #32 // ...................................................................*.......
+ // ldr q12, [x5, #-16] // ....................................................................*......
+ // ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
+ // ldr q19, [x7], #32 // ......................................................................*....
+ // ldr q31, [x7, #-16] // .......................................................................*...
+ // ldr q17, [x8], #32 // ........................................................................*..
+ // ldr q18, [x8, #-16] // .........................................................................*.
+ // ld1 {v25.8H}, [x9], #16 // ..........................................................................*
+
+ sub count, count, #2
+polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop:
+ // Instructions: 65
+ // Expected cycles: 80
+ // Expected IPC: 0.81
+
+ // Cycle bound: 80.0
+ // IPC bound: 0.81
+
+ // Wall time: 11.64s
+ // User time: 11.64s
+
+ // ---------------------- original position ----------------------->
+ // 0 25 50
+ // |------------------------|------------------------|--------------
+ ldr q20, [x1], #32 // *................................................................
+ uzp1 v7.8H, v15.8H, v16.8H // ......*..........................................................
+ uzp2 v15.8H, v15.8H, v16.8H // .......*.........................................................
+ uzp1 v8.8H, v20.8H, v30.8H // ..*..............................................................
+ uzp2 v20.8H, v20.8H, v30.8H // ...*.............................................................
+ smull v30.4S, v8.4H, v15.4H // .............*...................................................
+ smull2 v15.4S, v8.8H, v15.8H // ..............*..................................................
+ smull v11.4S, v8.4H, v7.4H // .........*.......................................................
+ smull2 v8.4S, v8.8H, v7.8H // ..........*......................................................
+ smlal v30.4S, v20.4H, v7.4H // ...............*.................................................
+ smlal2 v15.4S, v20.8H, v7.8H // ................*................................................
+ smlal v11.4S, v20.4H, v9.4H // ...........*.....................................................
+ smlal2 v8.4S, v20.8H, v9.8H // ............*....................................................
+ uzp1 v7.8H, v21.8H, v6.8H // ...................*.............................................
+ uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................
+ uzp1 v16.8H, v1.8H, v12.8H // .......................*.........................................
+ uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................
+ smlal v11.4S, v7.4H, v16.4H // ..........................*......................................
+ smlal2 v8.4S, v7.8H, v16.8H // ...........................*.....................................
+ smlal v30.4S, v7.4H, v9.4H // ..............................*..................................
+ smlal2 v15.4S, v7.8H, v9.8H // ...............................*.................................
+ smlal v11.4S, v20.4H, v24.4H // ............................*....................................
+ smlal2 v8.4S, v20.8H, v24.8H // .............................*...................................
+ smlal v30.4S, v20.4H, v16.4H // ................................*................................
+ smlal2 v15.4S, v20.8H, v16.8H // .................................*...............................
+ uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................
+ uzp2 v20.8H, v19.8H, v31.8H // .....................................*...........................
+ uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................
+ uzp2 v9.8H, v17.8H, v18.8H // .........................................*.......................
+ smlal v11.4S, v7.4H, v16.4H // ...........................................*.....................
+ smlal2 v8.4S, v7.8H, v16.8H // ............................................*....................
+ smlal v30.4S, v7.4H, v9.4H // ...............................................*.................
+ smlal2 v15.4S, v7.8H, v9.8H // ................................................*................
+ smlal v11.4S, v20.4H, v25.4H // .............................................*...................
+ smlal2 v8.4S, v20.8H, v25.8H // ..............................................*..................
+ smlal v30.4S, v20.4H, v16.4H // .................................................*...............
+ smlal2 v15.4S, v20.8H, v16.8H // ..................................................*..............
+ ldr q16, [x2, #16] // .....e...........................................................
+ uzp1 v7.8H, v11.8H, v8.8H // ...................................................*.............
+ uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........
+ mul v7.8H, v7.8H, v2.8H // ....................................................*............
+ mul v20.8H, v20.8H, v2.8H // .........................................................*.......
+ zip2 v9.8H, v27.8H, v10.8H // ..............................................................l..
+ zip1 v27.8H, v27.8H, v10.8H // .............................................................l...
+ smlal v11.4S, v7.4H, v0.4H // .....................................................*...........
+ smlal2 v8.4S, v7.8H, v0.8H // ......................................................*..........
+ smlal v30.4S, v20.4H, v0.4H // ..........................................................*......
+ smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*.....
+ str q27, [x0], #32 // ...............................................................l.
+ uzp2 v27.8H, v11.8H, v8.8H // .......................................................*.........
+ str q9, [x0, #-16] // ................................................................l
+ uzp2 v10.8H, v30.8H, v15.8H // ............................................................*....
+ ldr q30, [x1, #16] // .e...............................................................
+ ldr q15, [x2], #32 // ....e............................................................
+ ld1 {v9.8H}, [x3], #16 // ........e........................................................
+ ldr q21, [x4], #32 // .................e...............................................
+ ldr q6, [x4, #-16] // ..................e..............................................
+ ldr q1, [x5], #32 // .....................e...........................................
+ ldr q12, [x5, #-16] // ......................e..........................................
+ ld1 {v24.8H}, [x6], #16 // .........................e.......................................
+ ldr q19, [x7], #32 // ..................................e..............................
+ ldr q31, [x7, #-16] // ...................................e.............................
+ ldr q17, [x8], #32 // ......................................e..........................
+ ldr q18, [x8, #-16] // .......................................e.........................
+ ld1 {v25.8H}, [x9], #16 // ..........................................e......................
+
+ // ---------------------------------------------------------------- new position ----------------------------------------------------------------->
+ // 0 25 50 75 100 125
+ // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------
+ // ldr q12, [x1], #32 // ............................*................................................................~..................................................
+ // ldr q13, [x1, #-16] // ...............e............'...................................................~............'..................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~...............................................
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~..............................................
+ // ldr q12, [x2], #32 // ................e...........'....................................................~...........'..................................................
+ // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~.............
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~.................................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................
+ // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'..................................................
+ // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~...........................................
+ // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~..........................................
+ // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~.......................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~......................................
+ // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~.............................................
+ // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................
+ // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~.........................................
+ // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................
+ // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'..................................................
+ // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'..................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~.....................................
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~....................................
+ // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'..................................................
+ // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'..................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~...................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~..................................
+ // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'..................................................
+ // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~.................................
+ // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................
+ // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~.............................
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................
+ // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~...............................
+ // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~..............................
+ // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~...........................
+ // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~..........................
+ // ldr q12, [x7], #32 // .......................e....'...........................................................~....'..................................................
+ // ldr q13, [x7, #-16] // ........................e...'............................................................~...'..................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~.........................
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................
+ // ldr q12, [x8], #32 // .........................e..'.............................................................~..'..................................................
+ // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'..................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~.......................
+ // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~......................
+ // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'..................................................
+ // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~.....................
+ // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~....................
+ // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~.................
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................
+ // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~...................
+ // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~..................
+ // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~...............
+ // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~..............
+ // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............
+ // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~..........
+ // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~......
+ // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~.....
+ // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~.
+ // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~...........
+ // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~.........
+ // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~....
+ // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~...
+ // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'..................................................
+ // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l.......
+ // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........
+ // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l..
+ // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l
+
+ sub count, count, #1
+ cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop
+ // Instructions: 55
+ // Expected cycles: 61
+ // Expected IPC: 0.90
+
+ // Cycle bound: 61.0
+ // IPC bound: 0.90
+
+ // Wall time: 8.41s
+ // User time: 8.41s
+
+ // ----------------- original position ------------------>
+ // 0 25 50
+ // |------------------------|------------------------|----
+ ldr q7, [x1], #32 // *......................................................
+ uzp1 v20.8H, v15.8H, v16.8H // .*.....................................................
+ uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
+ uzp1 v23.8H, v7.8H, v30.8H // ...*...................................................
+ uzp2 v11.8H, v7.8H, v30.8H // ....*..................................................
+ smull2 v8.4S, v23.8H, v20.8H // ........*..............................................
+ smull v5.4S, v23.4H, v20.4H // .......*...............................................
+ smull2 v30.4S, v23.8H, v15.8H // ......*................................................
+ uzp1 v28.8H, v1.8H, v12.8H // ...............*.......................................
+ smlal2 v8.4S, v11.8H, v9.8H // ............*..........................................
+ smlal v5.4S, v11.4H, v9.4H // ...........*...........................................
+ uzp1 v3.8H, v21.8H, v6.8H // .............*.........................................
+ smull v16.4S, v23.4H, v15.4H // .....*.................................................
+ smlal2 v8.4S, v3.8H, v28.8H // ..................*....................................
+ smlal v5.4S, v3.4H, v28.4H // .................*.....................................
+ uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................
+ uzp1 v7.8H, v17.8H, v18.8H // ...........................*...........................
+ smlal2 v8.4S, v29.8H, v24.8H // ......................*................................
+ uzp1 v14.8H, v19.8H, v31.8H // .........................*.............................
+ smlal v16.4S, v11.4H, v20.4H // .........*.............................................
+ smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................
+ smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................
+ uzp2 v20.8H, v1.8H, v12.8H // ................*......................................
+ uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................
+ smlal2 v30.4S, v3.8H, v20.8H // ....................*..................................
+ smlal v16.4S, v3.4H, v20.4H // ...................*...................................
+ smlal v5.4S, v29.4H, v24.4H // .....................*.................................
+ uzp2 v9.8H, v17.8H, v18.8H // ............................*..........................
+ smlal2 v30.4S, v29.8H, v28.8H // ........................*..............................
+ smlal v16.4S, v29.4H, v28.4H // .......................*...............................
+ smlal v5.4S, v14.4H, v7.4H // .............................*.........................
+ smlal2 v8.4S, v21.8H, v25.8H // ..................................*....................
+ smlal2 v30.4S, v14.8H, v9.8H // ................................*......................
+ smlal v16.4S, v14.4H, v9.4H // ...............................*.......................
+ smlal v5.4S, v21.4H, v25.4H // .................................*.....................
+ zip1 v20.8H, v27.8H, v10.8H // ..........................................*............
+ smlal2 v30.4S, v21.8H, v7.8H // ....................................*..................
+ smlal v16.4S, v21.4H, v7.4H // ...................................*...................
+ uzp1 v7.8H, v5.8H, v8.8H // .....................................*.................
+ str q20, [x0], #32 // ...............................................*.......
+ mul v15.8H, v7.8H, v2.8H // .......................................*...............
+ uzp1 v7.8H, v16.8H, v30.8H // ......................................*................
+ zip2 v31.8H, v27.8H, v10.8H // .........................................*.............
+ mul v20.8H, v7.8H, v2.8H // ........................................*..............
+ smlal v5.4S, v15.4H, v0.4H // ...........................................*...........
+ smlal2 v8.4S, v15.8H, v0.8H // ............................................*..........
+ str q31, [x0, #-16] // .................................................*.....
+ smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........
+ smlal v16.4S, v20.4H, v0.4H // .............................................*.........
+ uzp2 v15.8H, v5.8H, v8.8H // ................................................*......
+ uzp2 v20.8H, v16.8H, v30.8H // ..................................................*....
+ zip1 v7.8H, v15.8H, v20.8H // ....................................................*..
+ zip2 v20.8H, v15.8H, v20.8H // ...................................................*...
+ str q7, [x0], #32 // .....................................................*.
+ str q20, [x0, #-16] // ......................................................*
+
+ // -------------------- new position -------------------->
+ // 0 25 50
+ // |------------------------|------------------------|----
+ // ldr q20, [x1], #32 // *......................................................
+ // uzp1 v7.8H, v15.8H, v16.8H // .*.....................................................
+ // uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
+ // uzp1 v8.8H, v20.8H, v30.8H // ...*...................................................
+ // uzp2 v20.8H, v20.8H, v30.8H // ....*..................................................
+ // smull v30.4S, v8.4H, v15.4H // ............*..........................................
+ // smull2 v15.4S, v8.8H, v15.8H // .......*...............................................
+ // smull v11.4S, v8.4H, v7.4H // ......*................................................
+ // smull2 v8.4S, v8.8H, v7.8H // .....*.................................................
+ // smlal v30.4S, v20.4H, v7.4H // ...................*...................................
+ // smlal2 v15.4S, v20.8H, v7.8H // ....................*..................................
+ // smlal v11.4S, v20.4H, v9.4H // ..........*............................................
+ // smlal2 v8.4S, v20.8H, v9.8H // .........*.............................................
+ // uzp1 v7.8H, v21.8H, v6.8H // ...........*...........................................
+ // uzp2 v20.8H, v21.8H, v6.8H // ...............*.......................................
+ // uzp1 v16.8H, v1.8H, v12.8H // ........*..............................................
+ // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................
+ // smlal v11.4S, v7.4H, v16.4H // ..............*........................................
+ // smlal2 v8.4S, v7.8H, v16.8H // .............*.........................................
+ // smlal v30.4S, v7.4H, v9.4H // .........................*.............................
+ // smlal2 v15.4S, v7.8H, v9.8H // ........................*..............................
+ // smlal v11.4S, v20.4H, v24.4H // ..........................*............................
+ // smlal2 v8.4S, v20.8H, v24.8H // .................*.....................................
+ // smlal v30.4S, v20.4H, v16.4H // .............................*.........................
+ // smlal2 v15.4S, v20.8H, v16.8H // ............................*..........................
+ // uzp1 v7.8H, v19.8H, v31.8H // ..................*....................................
+ // uzp2 v20.8H, v19.8H, v31.8H // .......................*...............................
+ // uzp1 v16.8H, v17.8H, v18.8H // ................*......................................
+ // uzp2 v9.8H, v17.8H, v18.8H // ...........................*...........................
+ // smlal v11.4S, v7.4H, v16.4H // ..............................*........................
+ // smlal2 v8.4S, v7.8H, v16.8H // .....................*.................................
+ // smlal v30.4S, v7.4H, v9.4H // .................................*.....................
+ // smlal2 v15.4S, v7.8H, v9.8H // ................................*......................
+ // smlal v11.4S, v20.4H, v25.4H // ..................................*....................
+ // smlal2 v8.4S, v20.8H, v25.8H // ...............................*.......................
+ // smlal v30.4S, v20.4H, v16.4H // .....................................*.................
+ // smlal2 v15.4S, v20.8H, v16.8H // ....................................*..................
+ // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................
+ // uzp1 v20.8H, v30.8H, v15.8H // .........................................*.............
+ // mul v7.8H, v7.8H, v2.8H // ........................................*..............
+ // mul v20.8H, v20.8H, v2.8H // ...........................................*...........
+ // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............
+ // zip1 v27.8H, v27.8H, v10.8H // ...................................*...................
+ // smlal v11.4S, v7.4H, v0.4H // ............................................*..........
+ // smlal2 v8.4S, v7.8H, v0.8H // .............................................*.........
+ // smlal v30.4S, v20.4H, v0.4H // ................................................*......
+ // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*.......
+ // str q27, [x0], #32 // .......................................*...............
+ // uzp2 v27.8H, v11.8H, v8.8H // .................................................*.....
+ // str q9, [x0, #-16] // ..............................................*........
+ // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*....
+ // zip2 v9.8H, v27.8H, v10.8H // ....................................................*..
+ // zip1 v27.8H, v27.8H, v10.8H // ...................................................*...
+ // str q27, [x0], #32 // .....................................................*.
+ // str q9, [x0, #-16] // ......................................................*
+
+
+ pop_stack
+ ret
+
+/****************** REGISTER DEALLOCATIONS *******************/
+ .unreq out
+ .unreq a0_ptr
+ .unreq b0_ptr
+ .unreq b0_cache_ptr
+ .unreq a1_ptr
+ .unreq b1_ptr
+ .unreq b1_cache_ptr
+ .unreq a2_ptr
+ .unreq b2_ptr
+ .unreq b2_cache_ptr
+ .unreq a3_ptr
+ .unreq b3_ptr
+ .unreq b3_cache_ptr
+ .unreq count
+ .unreq modulus
+ .unreq modulus_twisted
+ .unreq wtmp
+ .unreq aa0
+ .unreq aa1
+ .unreq bb0
+ .unreq bb1
+ .unreq bb1t
+ .unreq res0l
+ .unreq res1l
+ .unreq res0h
+ .unreq res1h
+ .unreq tmp0
+ .unreq tmp1
+ .unreq q_tmp0
+ .unreq q_tmp1
+ .unreq out0
+ .unreq out1
+ .unreq t0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S
new file mode 100644
index 000000000..c3d70ed42
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// AArch64 re-implementation of the asymmetric base multiplication from:
+
+// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+// https://eprint.iacr.org/2021/986
+// https://github.com/neon-ntt/neon-ntt
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 4
+/* simpasm: header-end */
+
+// Input:
+// - Vectors al, ah of 32-bit entries (both are clobbered, as is t0)
+// Output:
+// - Montgomery reductions of al || ah, stored as 16-bit entries in x
+.macro montgomery_reduce_long x, a
+ uzp1 t0.8h, \a\()l.8h, \a\()h.8h // t0 = low 16 bits of every 32-bit entry of al || ah
+ mul t0.8h, t0.8h, modulus_twisted.8h // t0 = t0 * modulus_twisted (mod 2^16)
+ smlal \a\()l.4s, t0.4h, modulus.4h // al += t0[0..3] * modulus (widening to 32 bits)
+ smlal2 \a\()h.4s, t0.8h, modulus.8h // ah += t0[4..7] * modulus (widening to 32 bits)
+ uzp2 \x\().8h, \a\()l.8h, \a\()h.8h // x = high 16 bits of every 32-bit entry of al || ah
+.endm
+
+// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+
+// Bounds:
+// - Assume |a| < 4096,
+// - Result: < 2*4096*2^15 = 2^28
+.macro pmull d, a, b
+ smull \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0l = a0 * b0 (entries 0-3)
+ smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h = a0 * b0 (entries 4-7)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t (entries 0-3)
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t (entries 4-7)
+
+ smull \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l = a0 * b1 (entries 0-3)
+ smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h = a0 * b1 (entries 4-7)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0 (entries 0-3)
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0 (entries 4-7)
+.endm
+
+.macro pmlal d, a, b
+ smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h // accumulating form of pmull: d0l += a0 * b0 (entries 0-3)
+ smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h += a0 * b0 (entries 4-7)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t (entries 0-3)
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t (entries 4-7)
+
+ smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l += a0 * b1 (entries 0-3)
+ smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h += a0 * b1 (entries 4-7)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0 (entries 0-3)
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0 (entries 4-7)
+.endm
+
+.macro ld2_wrap a, ptr
+ ldr q_tmp0, [\ptr\()], #32 // tmp0 = bytes 0-15 at ptr, then ptr advances by 32
+ ldr q_tmp1, [\ptr\(), #-16] // tmp1 = bytes 16-31 of the same 32-byte chunk
+ uzp1 \a\()0.8h, tmp0.8h, tmp1.8h // a0 = even-indexed 16-bit entries of tmp0 || tmp1
+ uzp2 \a\()1.8h, tmp0.8h, tmp1.8h // a1 = odd-indexed 16-bit entries of tmp0 || tmp1
+.endm
+
+.macro st2_wrap a, ptr
+ zip1 tmp0.8h, \a\()0.8h, \a\()1.8h // tmp0 = entries 0-3 of a0 and a1, interleaved
+ zip2 tmp1.8h, \a\()0.8h, \a\()1.8h // tmp1 = entries 4-7 of a0 and a1, interleaved
+ str q_tmp0, [\ptr\()], #32 // store tmp0 to bytes 0-15 at ptr, then ptr advances by 32
+ str q_tmp1, [\ptr\(), #-16] // store tmp1 to bytes 16-31 of the same 32-byte chunk
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
+ ld2_wrap \a\(), \a_ptr // a0 / a1 = even / odd 16-bit entries of the next 32 bytes at a_ptr
+ ld2_wrap \b\(), \b_ptr // b0 / b1 = even / odd 16-bit entries of the next 32 bytes at b_ptr
+ ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 // b1t = next 8 halfwords at b_cache_ptr, which advances by 16
+.endm
+
+.macro save_vregs
+ sub sp, sp, #(16*4) // reserve 64 bytes of stack for d8-d15
+ stp d8, d9, [sp, #16*0] // spill the low 64 bits of v8-v15 (callee-saved under AAPCS64)
+ stp d10, d11, [sp, #16*1]
+ stp d12, d13, [sp, #16*2]
+ stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+ ldp d8, d9, [sp, #16*0] // reload d8-d15 from the slots written by save_vregs
+ ldp d10, d11, [sp, #16*1]
+ ldp d12, d13, [sp, #16*2]
+ ldp d14, d15, [sp, #16*3]
+ add sp, sp, #(16*4) // release the 64-byte save area
+.endm
+
+.macro push_stack
+ save_vregs // only d8-d15 are saved; no general-purpose registers are spilled here
+.endm
+
+.macro pop_stack
+ restore_vregs // undo push_stack: reload d8-d15 and release the save area
+.endm
+
+ out .req x0
+ a0_ptr .req x1 // base pointer; a1_ptr..a3_ptr are derived from it in 512-byte steps
+ b0_ptr .req x2 // base pointer; b1_ptr..b3_ptr are derived from it in 512-byte steps
+ b0_cache_ptr .req x3 // base pointer; b1..b3 cache pointers are derived in 256-byte steps
+ a1_ptr .req x4
+ b1_ptr .req x5
+ b1_cache_ptr .req x6
+ a2_ptr .req x7
+ b2_ptr .req x8
+ b2_cache_ptr .req x9
+ a3_ptr .req x10
+ b3_ptr .req x11
+ b3_cache_ptr .req x12
+ count .req x13 // loop counter, initialised to MLKEM_N / 16
+ wtmp .req w14 // scratch used to materialise the constants below
+
+ modulus .req v0 // 3329 in every 16-bit lane
+ modulus_twisted .req v2 // 3327 in every 16-bit lane; 3329 * 3327 = -1 mod 2^16
+
+ aa0 .req v3 // aa0/aa1: the a0/a1 operands of pmull/pmlal when a=aa
+ aa1 .req v4
+ bb0 .req v5 // bb0/bb1/bb1t: the b0/b1/b1t operands of pmull/pmlal when b=bb
+ bb1 .req v6
+ bb1t .req v7
+
+ res0l .req v8 // res*: the 32-bit accumulators of pmull/pmlal when d=res
+ res1l .req v9
+ res0h .req v10
+ res1h .req v11
+
+ tmp0 .req v12 // tmp0 and q_tmp0 name the same register (v12 / q12)
+ tmp1 .req v13 // tmp1 and q_tmp1 name the same register (v13 / q13)
+ q_tmp0 .req q12
+ q_tmp1 .req q13
+
+ out0 .req v26
+ out1 .req v27
+
+ t0 .req v28 // scratch vector written by montgomery_reduce_long
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt):
+ push_stack
+ mov wtmp, #3329
+ dup modulus.8h, wtmp
+
+ mov wtmp, #3327
+ dup modulus_twisted.8h, wtmp
+
+ // Computed bases of vector entries
+
+ add a1_ptr, a0_ptr, #(1 * 512)
+ add b1_ptr, b0_ptr, #(1 * 512)
+ add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
+ add a2_ptr, a0_ptr, #(2 * 512)
+ add b2_ptr, b0_ptr, #(2 * 512)
+ add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
+ add a3_ptr, a0_ptr, #(3 * 512)
+ add b3_ptr, b0_ptr, #(3 * 512)
+ add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2)
+
+ // Bounds:
+
+ // Each pmull is bound by 2*4096*2^15=2^28, so the final value
+ // before Montgomery reduction is bound by 2^30.
+
+ mov count, #(MLKEM_N / 16)
+ // Instructions: 114
+ // Expected cycles: 153
+ // Expected IPC: 0.75
+ //
+ // Cycle bound: 153.0
+ // IPC bound: 0.75
+ //
+ // Wall time: 0.69s
+ // User time: 0.69s
+ //
+ // ----------------------------------------------- original position ----------------------------------------------->
+ // 0 25 50 75 100
+ // |------------------------|------------------------|------------------------|------------------------|-------------
+ ldr q23, [x2, #16] // .*................................................................................................................
+ ldr q19, [x2], #32 // *.................................................................................................................
+ ldr q17, [x5], #32 // ..*...............................................................................................................
+ uzp2 v13.8H, v19.8H, v23.8H // ..........*.......................................................................................................
+ uzp1 v19.8H, v19.8H, v23.8H // ...........*......................................................................................................
+ ldr q23, [x5, #-16] // ...*..............................................................................................................
+ ldr q30, [x1, #16] // .....*............................................................................................................
+ uzp2 v9.8H, v17.8H, v23.8H // ....*.............................................................................................................
+ uzp1 v23.8H, v17.8H, v23.8H // .......*..........................................................................................................
+ ldr q17, [x1], #32 // ......*...........................................................................................................
+ ldr q10, [x7, #16] // .............*....................................................................................................
+ uzp1 v12.8H, v17.8H, v30.8H // ........*.........................................................................................................
+ uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................
+ smull2 v30.4S, v12.8H, v13.8H // ............*.....................................................................................................
+ smull v13.4S, v12.4H, v13.4H // ............................................*.....................................................................
+ smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................
+ smull v12.4S, v12.4H, v19.4H // ..........................................*.......................................................................
+ smlal2 v30.4S, v17.8H, v19.8H // ...............................*..................................................................................
+ smlal v13.4S, v17.4H, v19.4H // ...............................................*..................................................................
+ ldr q19, [x4], #32 // ....................*.............................................................................................
+ ldr q16, [x4, #-16] // .....................*............................................................................................
+ ld1 {v8.8H}, [x3], #16 // ................................*.................................................................................
+ uzp1 v26.8H, v19.8H, v16.8H // .......................*..........................................................................................
+ uzp2 v19.8H, v19.8H, v16.8H // ........................*.........................................................................................
+ smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................
+ smlal v13.4S, v26.4H, v9.4H // ..................................................*...............................................................
+ smlal2 v22.4S, v17.8H, v8.8H // ........................................*.........................................................................
+ smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................
+ smlal2 v30.4S, v19.8H, v23.8H // ...................................*..............................................................................
+ smlal v13.4S, v19.4H, v23.4H // .......................................................*..........................................................
+ smlal2 v22.4S, v26.8H, v23.8H // ...........................................*......................................................................
+ smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................
+ ldr q23, [x7], #32 // ......................*...........................................................................................
+ ldr q17, [x8, #16] // ..............*...................................................................................................
+ uzp1 v9.8H, v23.8H, v10.8H // ..........................*.......................................................................................
+ uzp2 v23.8H, v23.8H, v10.8H // ....................................*.............................................................................
+ ldr q10, [x10], #32 // ...............*..................................................................................................
+ ldr q16, [x10, #-16] // ................*.................................................................................................
+ ld1 {v8.8H}, [x12], #16 // .................*................................................................................................
+ uzp1 v26.8H, v10.8H, v16.8H // ..................*...............................................................................................
+ uzp2 v10.8H, v10.8H, v16.8H // ...................*..............................................................................................
+ ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................
+ ldr q3, [x11, #16] // ...........................*......................................................................................
+ smlal2 v22.4S, v19.8H, v16.8H // ..............................................*...................................................................
+ smlal v12.4S, v19.4H, v16.4H // ........................................................*.........................................................
+ ldr q19, [x11], #32 // ............................*.....................................................................................
+ ld1 {v16.8H}, [x9], #16 // .............................*....................................................................................
+ uzp1 v4.8H, v19.8H, v3.8H // ..................................*...............................................................................
+ uzp2 v19.8H, v19.8H, v3.8H // .......................................*..........................................................................
+ ldr q3, [x8], #32 // ..............................*...................................................................................
+ ldr q31, [x2], #32 // ......................................*...........................................................................
+ uzp1 v6.8H, v3.8H, v17.8H // ...................................................*..............................................................
+ uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................
+ smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*.......................................................
+ smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*......................................................
+ smlal v13.4S, v9.4H, v17.4H // ............................................................*.....................................................
+ smlal v12.4S, v9.4H, v6.4H // .............................................................*....................................................
+ smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*...................................................
+ smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*..................................................
+ smlal v13.4S, v23.4H, v6.4H // ................................................................*.................................................
+ smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................
+ smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*...............................................
+ smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*..............................................
+ smlal v13.4S, v26.4H, v19.4H // ....................................................................*.............................................
+ smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................
+ smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*...........................................
+ smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*..........................................
+ smlal v13.4S, v10.4H, v4.4H // ........................................................................*.........................................
+ smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................
+ ldr q19, [x2, #-16] // .........................................*........................................................................
+ uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*......................................
+ uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*.............................
+ mul v23.8H, v23.8H, v2.8H // .............................................................................*....................................
+ uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*.................................
+ uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*..............................
+ mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................
+ smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................
+ smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*...............................
+ ldr q23, [x5], #32 // .............................................*....................................................................
+ smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*......
+ uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*...........................
+ smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*.....
+ ldr q17, [x5, #-16] // ................................................*.................................................................
+ ldr q13, [x1, #16] // ......................................................*...........................................................
+ uzp2 v27.8H, v23.8H, v17.8H // ....................................................*.............................................................
+ uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*.....................................
+ uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*..
+ ldr q23, [x1], #32 // ..........................................................................*.......................................
+ zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................*
+ ldr q3, [x7, #16] // ........................................................................................*.........................
+ uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*...................................
+ uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*..................................
+ smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*..........................
+ ldr q6, [x8, #16] // .........................................................................................*........................
+ ldr q23, [x10], #32 // ..........................................................................................*.......................
+ smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*.......
+ ldr q17, [x10, #-16] // ...........................................................................................*......................
+ ld1 {v22.8H}, [x12], #16 // ............................................................................................*.....................
+ uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*....................
+ uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*...................
+ ldr q23, [x4], #32 // ...............................................................................................*..................
+ ldr q17, [x4, #-16] // ................................................................................................*.................
+ ldr q4, [x7], #32 // .................................................................................................*................
+ uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*...............
+ uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*..............
+ uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............
+ smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*...
+ ld1 {v8.8H}, [x6], #16 // ....................................................................................................*.............
+ ldr q25, [x11, #16] // ......................................................................................................*...........
+ ldr q29, [x11], #32 // .......................................................................................................*..........
+ ld1 {v12.8H}, [x9], #16 // ........................................................................................................*.........
+ uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*.
+ ldr q14, [x8], #32 // .........................................................................................................*........
+ ld1 {v23.8H}, [x3], #16 // .............................................................................................................*....
+
+ // ------------------------------------------------- new position -------------------------------------------------->
+ // 0 25 50 75 100
+ // |------------------------|------------------------|------------------------|------------------------|-------------
+ // ldr q3, [x2], #32 // .*................................................................................................................
+ // ldr q17, [x2, #-16] // *.................................................................................................................
+ // ldr q21, [x5], #32 // ..*...............................................................................................................
+ // ldr q19, [x5, #-16] // .....*............................................................................................................
+ // uzp2 v27.8H, v21.8H, v19.8H // .......*..........................................................................................................
+ // ldr q25, [x1, #16] // ......*...........................................................................................................
+ // ldr q22, [x1], #32 // .........*........................................................................................................
+ // uzp1 v28.8H, v21.8H, v19.8H // ........*.........................................................................................................
+ // uzp1 v31.8H, v22.8H, v25.8H // ...........*......................................................................................................
+ // uzp2 v16.8H, v22.8H, v25.8H // ............*.....................................................................................................
+ // uzp2 v21.8H, v3.8H, v17.8H // ...*..............................................................................................................
+ // uzp1 v19.8H, v3.8H, v17.8H // ....*.............................................................................................................
+ // smull2 v24.4S, v31.8H, v21.8H // .............*....................................................................................................
+ // ldr q3, [x7, #16] // ..........*.......................................................................................................
+ // ldr q6, [x8, #16] // .................................*................................................................................
+ // ldr q8, [x10], #32 // ....................................*.............................................................................
+ // ldr q26, [x10, #-16] // .....................................*............................................................................
+ // ld1 {v22.8H}, [x12], #16 // ......................................*...........................................................................
+ // uzp1 v30.8H, v8.8H, v26.8H // .......................................*..........................................................................
+ // uzp2 v11.8H, v8.8H, v26.8H // ........................................*.........................................................................
+ // ldr q8, [x4], #32 // ...................*..............................................................................................
+ // ldr q26, [x4, #-16] // ....................*.............................................................................................
+ // ldr q4, [x7], #32 // ................................*.................................................................................
+ // uzp1 v20.8H, v8.8H, v26.8H // ......................*...........................................................................................
+ // uzp2 v26.8H, v8.8H, v26.8H // .......................*..........................................................................................
+ // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................
+ // uzp1 v9.8H, v4.8H, v3.8H // ..................................*...............................................................................
+ // ldr q25, [x11, #16] // ..........................................*.......................................................................
+ // ldr q29, [x11], #32 // .............................................*....................................................................
+ // ld1 {v12.8H}, [x9], #16 // ..............................................*...................................................................
+ // ldr q14, [x8], #32 // .................................................*................................................................
+ // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................
+ // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................
+ // smlal2 v24.4S, v20.8H, v27.8H // ........................*.........................................................................................
+ // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*..................................................................
+ // smlal2 v24.4S, v26.8H, v28.8H // ............................*.....................................................................................
+ // uzp2 v4.8H, v4.8H, v3.8H // ...................................*..............................................................................
+ // smull2 v13.4S, v31.8H, v19.8H // ...............*..................................................................................................
+ // ldr q3, [x2], #32 // ..................................................*...............................................................
+ // uzp2 v1.8H, v29.8H, v25.8H // ................................................*.................................................................
+ // smlal2 v13.4S, v16.8H, v23.8H // ..........................*.......................................................................................
+ // ldr q17, [x2, #-16] // .....................................................................*............................................
+ // smull v18.4S, v31.4H, v19.4H // ................*.................................................................................................
+ // smlal2 v13.4S, v20.8H, v28.8H // ..............................*...................................................................................
+ // smull v29.4S, v31.4H, v21.4H // ..............*...................................................................................................
+ // ldr q21, [x5], #32 // ..............................................................................*...................................
+ // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*......................................................................
+ // smlal v29.4S, v16.4H, v19.4H // ..................*...............................................................................................
+ // ldr q19, [x5, #-16] // ..................................................................................*...............................
+ // smlal v18.4S, v16.4H, v23.4H // ...........................*......................................................................................
+ // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................
+ // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*..............................................................
+ // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*.............................
+ // smlal v18.4S, v20.4H, v28.4H // ...............................*..................................................................................
+ // ldr q25, [x1, #16] // ...................................................................................*..............................
+ // smlal v29.4S, v26.4H, v28.4H // .............................*....................................................................................
+ // smlal v18.4S, v26.4H, v8.4H // ............................................*.....................................................................
+ // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*.............................................................
+ // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................
+ // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*...........................................................
+ // smlal v29.4S, v9.4H, v26.4H // .......................................................*..........................................................
+ // smlal v18.4S, v9.4H, v31.4H // ........................................................*.........................................................
+ // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................
+ // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*.......................................................
+ // smlal v29.4S, v4.4H, v31.4H // ...........................................................*......................................................
+ // smlal v18.4S, v4.4H, v12.4H // ............................................................*.....................................................
+ // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................................................
+ // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*...................................................
+ // smlal v29.4S, v30.4H, v1.4H // ...............................................................*..................................................
+ // smlal v18.4S, v30.4H, v10.4H // ................................................................*.................................................
+ // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................
+ // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*...............................................
+ // smlal v29.4S, v11.4H, v10.4H // ...................................................................*..............................................
+ // smlal v18.4S, v11.4H, v22.4H // ....................................................................*.............................................
+ // ldr q22, [x1], #32 // .......................................................................................*..........................
+ // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*...........................................
+ // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................
+ // mul v19.8H, v31.8H, v2.8H // ........................................................................*.........................................
+ // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*.......................
+ // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*......................
+ // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................
+ // smlal v29.4S, v19.4H, v0.4H // ............................................................................*.....................................
+ // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*....................................
+ // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*.......................................
+ // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*..........................................
+ // mul v23.8H, v26.8H, v2.8H // ...........................................................................*......................................
+ // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*.................................
+ // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*.....................
+ // ldr q3, [x7, #16] // .........................................................................................*........................
+ // ldr q6, [x8, #16] // .............................................................................................*....................
+ // ldr q8, [x10], #32 // ..............................................................................................*...................
+ // ldr q26, [x10, #-16] // ................................................................................................*.................
+ // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................
+ // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*...............
+ // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*..............
+ // ldr q8, [x4], #32 // ....................................................................................................*.............
+ // ldr q26, [x4, #-16] // .....................................................................................................*............
+ // ldr q4, [x7], #32 // ......................................................................................................*...........
+ // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*..........
+ // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*.........
+ // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*......
+ // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........
+ // ldr q25, [x11, #16] // ............................................................................................................*.....
+ // ldr q29, [x11], #32 // .............................................................................................................*....
+ // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*...
+ // ldr q14, [x8], #32 // ................................................................................................................*.
+ // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*..................
+ // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*..................................
+ // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................
+ // ld1 {v23.8H}, [x3], #16 // .................................................................................................................*
+ // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*.......
+ // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*...........................
+ // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*..
+ // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*.........................
+
+ sub count, count, #2
+polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop:
+ // Instructions: 82
+ // Expected cycles: 102
+ // Expected IPC: 0.80
+ //
+ // Cycle bound: 102.0
+ // IPC bound: 0.80
+ //
+ // Wall time: 15.93s
+ // User time: 15.93s
+ //
+ // ------------------------------- original position ------------------------------->
+ // 0 25 50 75
+ // |------------------------|------------------------|------------------------|------
+ smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................
+ uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................
+ smull2 v13.4S, v31.8H, v19.8H // ..........*.......................................................................
+ ldr q3, [x2], #32 // ....e.............................................................................
+ uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*.......................
+ smlal2 v13.4S, v16.8H, v23.8H // ............*.....................................................................
+ ldr q17, [x2, #-16] // .....e............................................................................
+ smull v18.4S, v31.4H, v19.4H // .........*........................................................................
+ smlal2 v13.4S, v20.8H, v28.8H // ...........................*......................................................
+ smull v29.4S, v31.4H, v21.4H // .............*....................................................................
+ ldr q21, [x5], #32 // .....................e............................................................
+ smlal2 v13.4S, v26.8H, v8.8H // .............................*....................................................
+ smlal v29.4S, v16.4H, v19.4H // ...............*..................................................................
+ ldr q19, [x5, #-16] // ......................e...........................................................
+ smlal v18.4S, v16.4H, v23.4H // ...........*......................................................................
+ smlal v29.4S, v20.4H, v27.4H // ..............................*...................................................
+ uzp1 v31.8H, v14.8H, v6.8H // ........................................*.........................................
+ uzp2 v27.8H, v21.8H, v19.8H // ........................e.........................................................
+ smlal v18.4S, v20.4H, v28.4H // ..........................*.......................................................
+ ldr q25, [x1, #16] // .e................................................................................
+ smlal v29.4S, v26.4H, v28.4H // ................................*.................................................
+ smlal v18.4S, v26.4H, v8.4H // ............................*.....................................................
+ uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................
+ smlal2 v13.4S, v9.8H, v31.8H // ............................................*.....................................
+ smlal2 v24.4S, v9.8H, v26.8H // ................................................*.................................
+ smlal v29.4S, v9.4H, v26.4H // ...............................................*..................................
+ smlal v18.4S, v9.4H, v31.4H // ...........................................*......................................
+ smlal2 v13.4S, v4.8H, v12.8H // ..............................................*...................................
+ smlal2 v24.4S, v4.8H, v31.8H // ..................................................*...............................
+ smlal v29.4S, v4.4H, v31.4H // .................................................*................................
+ smlal v18.4S, v4.4H, v12.4H // .............................................*....................................
+ smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................
+ smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................
+ smlal v29.4S, v30.4H, v1.4H // ................................................................*.................
+ smlal v18.4S, v30.4H, v10.4H // ............................................................*.....................
+ smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*..................
+ smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*..............
+ smlal v29.4S, v11.4H, v10.4H // ..................................................................*...............
+ smlal v18.4S, v11.4H, v22.4H // ..............................................................*...................
+ ldr q22, [x1], #32 // e.................................................................................
+ uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........
+ uzp1 v28.8H, v21.8H, v19.8H // .......................e..........................................................
+ mul v19.8H, v31.8H, v2.8H // ..........................................................................*.......
+ uzp1 v31.8H, v22.8H, v25.8H // ..e...............................................................................
+ uzp2 v16.8H, v22.8H, v25.8H // ...e..............................................................................
+ uzp2 v21.8H, v3.8H, v17.8H // .......e..........................................................................
+ smlal v29.4S, v19.4H, v0.4H // ...........................................................................*......
+ smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*.....
+ uzp1 v19.8H, v3.8H, v17.8H // ......e...........................................................................
+ uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*.............
+ zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l..
+ mul v23.8H, v26.8H, v2.8H // .....................................................................*............
+ uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*....
+ smull2 v24.4S, v31.8H, v21.8H // ..............e...................................................................
+ str q14, [x0, #16] // .................................................................................l
+ ldr q3, [x7, #16] // ...................................e..............................................
+ ldr q6, [x8, #16] // .......................................e..........................................
+ ldr q8, [x10], #32 // ...................................................e..............................
+ ldr q26, [x10, #-16] // ....................................................e.............................
+ ld1 {v22.8H}, [x12], #16 // ...........................................................e......................
+ uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................
+ uzp2 v11.8H, v8.8H, v26.8H // ......................................................e...........................
+ ldr q8, [x4], #32 // .................e................................................................
+ ldr q26, [x4, #-16] // ..................e...............................................................
+ ldr q4, [x7], #32 // ..................................e...............................................
+ uzp1 v20.8H, v8.8H, v26.8H // ...................e..............................................................
+ uzp2 v26.8H, v8.8H, v26.8H // ....................e.............................................................
+ ld1 {v8.8H}, [x6], #16 // .........................e........................................................
+ uzp1 v9.8H, v4.8H, v3.8H // ....................................e.............................................
+ ldr q25, [x11, #16] // ........................................................e.........................
+ ldr q29, [x11], #32 // .......................................................e..........................
+ ld1 {v12.8H}, [x9], #16 // ..........................................e.......................................
+ ldr q14, [x8], #32 // ......................................e...........................................
+ smlal2 v24.4S, v16.8H, v19.8H // ................e.................................................................
+ smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*..........
+ smlal v18.4S, v23.4H, v0.4H // ......................................................................*...........
+ ld1 {v23.8H}, [x3], #16 // ........e.........................................................................
+ smlal2 v24.4S, v20.8H, v27.8H // ...............................e..................................................
+ uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*.........
+ uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................
+ str q5, [x0], #32 // ................................................................................l.
+ zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*...
+
+ // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------>
+ // 0 25 50 75 100 125 150 175 200 225
+ // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------
+ // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~.........................................
+ // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~.............................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~.....................................
+ // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~....................................
+ // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~.............................................................................
+ // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~..........................................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~...................................
+ // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~....
+ // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~.........................................................................
+ // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~..............................................................................
+ // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~..................................................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~...........................................................................
+ // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~.......................................................................
+ // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~...........................
+ // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~....................................................................
+ // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~.......
+ // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~..................
+ // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~.................
+ // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~...............
+ // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~..............
+ // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~......................................................................
+ // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~...................................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~.......................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~...............................................................
+ // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~.............
+ // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~..............................................................
+ // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................
+ // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~...........................................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~.....................................................................
+ // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~.................................................................
+ // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~...
+ // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................
+ // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................
+ // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................
+ // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~.........................
+ // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............
+ // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~...............................................................................
+ // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........
+ // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................
+ // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~..........................................................
+ // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~.........
+ // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~......................................................
+ // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~.........................................................
+ // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~..................................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~.....................................................
+ // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~.......................................................
+ // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................
+ // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~...................................................
+ // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~....................................................
+ // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~.......................
+ // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~......................
+ // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~....................
+ // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~...................
+ // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~..........
+ // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~...........
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~.
+ // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................
+ // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~.....................
+ // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~..............................................
+ // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~.................................................
+ // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~..........................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~.............................................
+ // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~...............................................
+ // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................
+ // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~...........................................
+ // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................
+ // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~...............................
+ // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~.............................
+ // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~.....
+ // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~......
+ // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~..
+ // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................
+ // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~......................................
+ // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~..................................
+ // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~.................................
+ // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................
+ // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................
+ // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l..............................
+ // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l
+ // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l..........................
+
+ sub count, count, #1
+ cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop
+
+ // Instructions: 50
+ // Expected cycles: 56
+ // Expected IPC: 0.89
+ // The figures above match the 50 instructions that follow the loop branch (the loop postamble).
+ // Cycle bound: 56.0
+ // IPC bound: 0.89
+ // The postamble issues no loads: it consumes registers already filled inside the loop, finishes the multiply-accumulate chains, and writes 64 bytes through x0 (four 16-byte stores).
+ // Wall time: 4.16s
+ // User time: 4.16s
+ // In the diagram below, '*' appears to mark each instruction's slot in the pre-scheduling order (columns 0..49) -- TODO confirm against the scheduler's documentation.
+ // --------------- original position --------------->
+ // 0 25
+ // |------------------------|
+ smull2 v17.4S, v31.8H, v19.8H // ..*...............................................
+ uzp2 v1.8H, v14.8H, v6.8H // ................*.................................
+ smull v18.4S, v31.4H, v21.4H // .......*..........................................
+ smlal2 v24.4S, v26.8H, v28.8H // *.................................................
+ smlal2 v17.4S, v16.8H, v23.8H // ....*.............................................
+ smull v21.4S, v31.4H, v19.4H // .....*............................................
+ smlal v18.4S, v16.4H, v19.4H // .........*........................................
+ uzp2 v31.8H, v4.8H, v3.8H // .*................................................
+ uzp1 v3.8H, v14.8H, v6.8H // ............*.....................................
+ smlal v21.4S, v16.4H, v23.4H // ..........*.......................................
+ smlal v18.4S, v20.4H, v27.4H // ...........*......................................
+ uzp2 v14.8H, v29.8H, v25.8H // ...*..............................................
+ smlal2 v17.4S, v20.8H, v28.8H // ......*...........................................
+ smlal v21.4S, v20.4H, v28.4H // .............*....................................
+ smlal v18.4S, v26.4H, v28.4H // ..............*...................................
+ smlal2 v24.4S, v9.8H, v1.8H // ..................*...............................
+ smlal2 v17.4S, v26.8H, v8.8H // ........*.........................................
+ smlal v21.4S, v26.4H, v8.4H // ...............*..................................
+ smlal v18.4S, v9.4H, v1.4H // ...................*..............................
+ smlal2 v24.4S, v31.8H, v3.8H // ......................*...........................
+ smlal2 v17.4S, v9.8H, v3.8H // .................*................................
+ smlal v21.4S, v9.4H, v3.4H // ....................*.............................
+ smlal v18.4S, v31.4H, v3.4H // .......................*..........................
+ smlal2 v24.4S, v30.8H, v14.8H // ..........................*.......................
+ smlal2 v17.4S, v31.8H, v12.8H // .....................*............................
+ smlal v21.4S, v31.4H, v12.4H // ........................*.........................
+ smlal v18.4S, v30.4H, v14.4H // ...........................*......................
+ smlal2 v24.4S, v11.8H, v10.8H // ..............................*...................
+ smlal2 v17.4S, v30.8H, v10.8H // .........................*........................
+ smlal v21.4S, v30.4H, v10.4H // ............................*.....................
+ smlal v18.4S, v11.4H, v10.4H // ...............................*..................
+ zip2 v19.8H, v7.8H, v15.8H // ......................................*...........
+ smlal2 v17.4S, v11.8H, v22.8H // .............................*....................
+ smlal v21.4S, v11.4H, v22.4H // ................................*.................
+ uzp1 v23.8H, v18.8H, v24.8H // .................................*................
+ str q19, [x0, #16] // .........................................*........
+ mul v19.8H, v23.8H, v2.8H // ..................................*...............
+ uzp1 v23.8H, v21.8H, v17.8H // .....................................*............
+ str q5, [x0], #32 // .............................................*....
+ mul v26.8H, v23.8H, v2.8H // .......................................*..........
+ smlal v18.4S, v19.4H, v0.4H // ...................................*..............
+ smlal2 v24.4S, v19.8H, v0.8H // ....................................*.............
+ smlal v21.4S, v26.4H, v0.4H // ...........................................*......
+ smlal2 v17.4S, v26.8H, v0.8H // ..........................................*.......
+ uzp2 v13.8H, v18.8H, v24.8H // ........................................*.........
+ uzp2 v19.8H, v21.8H, v17.8H // ............................................*.....
+ zip1 v23.8H, v19.8H, v13.8H // ..............................................*...
+ zip2 v19.8H, v19.8H, v13.8H // ...............................................*..
+ str q23, [x0], #32 // .................................................*
+ str q19, [x0, #-16] // ................................................*.
+
+ // ----------------- new position ------------------>
+ // 0 25
+ // |------------------------|------------------------
+ // smlal2 v24.4S, v26.8H, v28.8H // ...*..............................................
+ // uzp2 v4.8H, v4.8H, v3.8H // .......*..........................................
+ // smull2 v13.4S, v31.8H, v19.8H // *.................................................
+ // uzp2 v1.8H, v29.8H, v25.8H // ...........*......................................
+ // smlal2 v13.4S, v16.8H, v23.8H // ....*.............................................
+ // smull v18.4S, v31.4H, v19.4H // .....*............................................
+ // smlal2 v13.4S, v20.8H, v28.8H // ............*.....................................
+ // smull v29.4S, v31.4H, v21.4H // ..*...............................................
+ // smlal2 v13.4S, v26.8H, v8.8H // ................*.................................
+ // smlal v29.4S, v16.4H, v19.4H // ......*...........................................
+ // smlal v18.4S, v16.4H, v23.4H // .........*........................................
+ // smlal v29.4S, v20.4H, v27.4H // ..........*.......................................
+ // uzp1 v31.8H, v14.8H, v6.8H // ........*.........................................
+ // smlal v18.4S, v20.4H, v28.4H // .............*....................................
+ // smlal v29.4S, v26.4H, v28.4H // ..............*...................................
+ // smlal v18.4S, v26.4H, v8.4H // .................*................................
+ // uzp2 v26.8H, v14.8H, v6.8H // .*................................................
+ // smlal2 v13.4S, v9.8H, v31.8H // ....................*.............................
+ // smlal2 v24.4S, v9.8H, v26.8H // ...............*..................................
+ // smlal v29.4S, v9.4H, v26.4H // ..................*...............................
+ // smlal v18.4S, v9.4H, v31.4H // .....................*............................
+ // smlal2 v13.4S, v4.8H, v12.8H // ........................*.........................
+ // smlal2 v24.4S, v4.8H, v31.8H // ...................*..............................
+ // smlal v29.4S, v4.4H, v31.4H // ......................*...........................
+ // smlal v18.4S, v4.4H, v12.4H // .........................*........................
+ // smlal2 v13.4S, v30.8H, v10.8H // ............................*.....................
+ // smlal2 v24.4S, v30.8H, v1.8H // .......................*..........................
+ // smlal v29.4S, v30.4H, v1.4H // ..........................*.......................
+ // smlal v18.4S, v30.4H, v10.4H // .............................*....................
+ // smlal2 v13.4S, v11.8H, v22.8H // ................................*.................
+ // smlal2 v24.4S, v11.8H, v10.8H // ...........................*......................
+ // smlal v29.4S, v11.4H, v10.4H // ..............................*...................
+ // smlal v18.4S, v11.4H, v22.4H // .................................*................
+ // uzp1 v31.8H, v29.8H, v24.8H // ..................................*...............
+ // mul v19.8H, v31.8H, v2.8H // ....................................*.............
+ // smlal v29.4S, v19.4H, v0.4H // ........................................*.........
+ // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........
+ // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............
+ // zip2 v14.8H, v7.8H, v15.8H // ...............................*..................
+ // mul v23.8H, v26.8H, v2.8H // .......................................*..........
+ // uzp2 v15.8H, v29.8H, v24.8H // ............................................*.....
+ // str q14, [x0, #16] // ...................................*..............
+ // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*......
+ // smlal v18.4S, v23.4H, v0.4H // ..........................................*.......
+ // uzp2 v7.8H, v18.8H, v13.8H // .............................................*....
+ // str q5, [x0], #32 // ......................................*...........
+ // zip1 v5.8H, v7.8H, v15.8H // ..............................................*...
+ // zip2 v14.8H, v7.8H, v15.8H // ...............................................*..
+ // str q14, [x0, #16] // .................................................*
+ // str q5, [x0], #32 // ................................................*.
+
+
+ pop_stack
+ ret
+
+/****************** REGISTER DEALLOCATIONS *******************/
+ .unreq out
+ .unreq a0_ptr
+ .unreq b0_ptr
+ .unreq b0_cache_ptr
+ .unreq a1_ptr
+ .unreq b1_ptr
+ .unreq b1_cache_ptr
+ .unreq a2_ptr
+ .unreq b2_ptr
+ .unreq b2_cache_ptr
+ .unreq a3_ptr
+ .unreq b3_ptr
+ .unreq b3_cache_ptr
+ .unreq count
+ .unreq modulus
+ .unreq modulus_twisted
+ .unreq wtmp
+ .unreq aa0
+ .unreq aa1
+ .unreq bb0
+ .unreq bb1
+ .unreq bb1t
+ .unreq res0l
+ .unreq res1l
+ .unreq res0h
+ .unreq res1h
+ .unreq tmp0
+ .unreq tmp1
+ .unreq q_tmp0
+ .unreq q_tmp1
+ .unreq out0
+ .unreq out1
+ .unreq t0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S
deleted file mode 100644
index 94f0889b7..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-//
-// AArch64 re-implementation of the asymmetric base multiplication from:
-//
-// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
-// https://eprint.iacr.org/2021/986
-// https://github.com/neon-ntt/neon-ntt
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
-
-// Input:
-// - Vectors al, ah of 32-bit entries
-// Output:
-// - Montgomery reductions of al || ah, stored in al
-.macro montgomery_reduce_long x, a
- uzp1 t0.8h, \a\()l.8h, \a\()h.8h
- mul t0.8h, t0.8h, modulus_twisted.8h
- smlal \a\()l.4s, t0.4h, modulus.4h
- smlal2 \a\()h.4s, t0.8h, modulus.8h
- uzp2 \x\().8h, \a\()l.8h, \a\()h.8h
-.endm
-
-// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit.
-//
-// Bounds:
-// - Assume |a| < 4096,
-// - Result: < 2*4096*2^15 = 2^28
-.macro pmull d, a, b
- smull \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smull \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro pmlal d, a, b
- smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro ld2_wrap a, ptr
- ldr q_tmp0, [\ptr\()], #32
- ldr q_tmp1, [\ptr\(), #-16]
- uzp1 \a\()0.8h, tmp0.8h, tmp1.8h
- uzp2 \a\()1.8h, tmp0.8h, tmp1.8h
-.endm
-
-.macro st2_wrap a, ptr
- zip1 tmp0.8h, \a\()0.8h, \a\()1.8h
- zip2 tmp1.8h, \a\()0.8h, \a\()1.8h
- str q_tmp0, [\ptr\()], #32
- str q_tmp1, [\ptr\(), #-16]
-.endm
-
-.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
- ld2_wrap \a\(), \a_ptr
- ld2_wrap \b\(), \b_ptr
- ld1 {\b\()1t.8h}, [\b_cache_ptr], #16
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
- out .req x0
- a0_ptr .req x1
- b0_ptr .req x2
- b0_cache_ptr .req x3
- a1_ptr .req x4
- b1_ptr .req x5
- b1_cache_ptr .req x6
- a2_ptr .req x7
- b2_ptr .req x8
- b2_cache_ptr .req x9
- a3_ptr .req x10
- b3_ptr .req x11
- b3_cache_ptr .req x12
- count .req x13
- wtmp .req w14
-
- modulus .req v0
- modulus_twisted .req v2
-
- aa0 .req v3
- aa1 .req v4
- bb0 .req v5
- bb1 .req v6
- bb1t .req v7
-
- res0l .req v8
- res1l .req v9
- res0h .req v10
- res1h .req v11
-
- tmp0 .req v12
- tmp1 .req v13
- q_tmp0 .req q12
- q_tmp1 .req q13
-
- out0 .req v26
- out1 .req v27
-
- t0 .req v28
-
-#if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
- push_stack
-
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
-
- mov count, #(MLKEM_N / 16)
-k2_loop_start:
-
- load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr
- pmull res, aa, bb
- load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr
- pmlal res, aa, bb
-
- montgomery_reduce_long out0, res0
- montgomery_reduce_long out1, res1
-
- st2_wrap out, out
-
- subs count, count, #1
- cbnz count, k2_loop_start
-
- pop_stack
- ret
-#endif /* MLKEM_K == 2 */
-
-#if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
-
- mov count, #(MLKEM_N / 16)
-k3_loop_start:
-
- load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr
- pmull res, aa, bb
- load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr
- pmlal res, aa, bb
- load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr
- pmlal res, aa, bb
-
- montgomery_reduce_long out0, res0
- montgomery_reduce_long out1, res1
-
- st2_wrap out, out
-
- subs count, count, #1
- cbnz count, k3_loop_start
-
- pop_stack
- ret
-#endif /* MLKEM_K == 3 */
-
-#if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
- add a3_ptr, a0_ptr, #(3 * 512)
- add b3_ptr, b0_ptr, #(3 * 512)
- add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2)
-
- // Bounds:
- //
- // Each pmull is bound by 2*4096*2^15=2^28, so the final value
- // before Montgomery reduction is bound by 2^30.
-
- mov count, #(MLKEM_N / 16)
-k4_loop_start:
-
- load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr
- pmull res, aa, bb
- load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr
- pmlal res, aa, bb
- load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr
- pmlal res, aa, bb
- load_polys aa, bb, a3_ptr, b3_ptr, b3_cache_ptr
- pmlal res, aa, bb
-
- montgomery_reduce_long out0, res0
- montgomery_reduce_long out1, res1
-
- st2_wrap out, out
-
- subs count, count, #1
- cbnz count, k4_loop_start
-
- pop_stack
- ret
-#endif /* MLKEM_K == 4 */
-
-/****************** REGISTER DEALLOCATIONS *******************/
- .unreq out
- .unreq a0_ptr
- .unreq b0_ptr
- .unreq b0_cache_ptr
- .unreq a1_ptr
- .unreq b1_ptr
- .unreq b1_cache_ptr
- .unreq a2_ptr
- .unreq b2_ptr
- .unreq b2_cache_ptr
- .unreq a3_ptr
- .unreq b3_ptr
- .unreq b3_cache_ptr
- .unreq count
- .unreq modulus
- .unreq modulus_twisted
- .unreq aa0
- .unreq aa1
- .unreq bb0
- .unreq bb1
- .unreq bb1t
- .unreq res0l
- .unreq res1l
- .unreq res0h
- .unreq wtmp
- .unreq res1h
- .unreq tmp0
- .unreq tmp1
- .unreq q_tmp0
- .unreq q_tmp1
- .unreq out0
- .unreq out1
- .unreq t0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S
deleted file mode 100644
index 275ca06d2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S
+++ /dev/null
@@ -1,1606 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-// AArch64 re-implementation of the asymmetric base multiplication from:
-
-// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
-// https://eprint.iacr.org/2021/986
-// https://github.com/neon-ntt/neon-ntt
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
-
-// Input:
-// - Vectors al, ah of 32-bit entries
-// Output:
-// - Montgomery reductions of al || ah, stored in al
-.macro montgomery_reduce_long x, a
- uzp1 t0.8h, \a\()l.8h, \a\()h.8h
- mul t0.8h, t0.8h, modulus_twisted.8h
- smlal \a\()l.4s, t0.4h, modulus.4h
- smlal2 \a\()h.4s, t0.8h, modulus.8h
- uzp2 \x\().8h, \a\()l.8h, \a\()h.8h
-.endm
-
-// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit.
-
-// Bounds:
-// - Assume |a| < 4096,
-// - Result: < 2*4096*2^15 = 2^28
-.macro pmull d, a, b
- smull \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smull \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro pmlal d, a, b
- smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro ld2_wrap a, ptr
- ldr q_tmp0, [\ptr\()], #32
- ldr q_tmp1, [\ptr\(), #-16]
- uzp1 \a\()0.8h, tmp0.8h, tmp1.8h
- uzp2 \a\()1.8h, tmp0.8h, tmp1.8h
-.endm
-
-.macro st2_wrap a, ptr
- zip1 tmp0.8h, \a\()0.8h, \a\()1.8h
- zip2 tmp1.8h, \a\()0.8h, \a\()1.8h
- str q_tmp0, [\ptr\()], #32
- str q_tmp1, [\ptr\(), #-16]
-.endm
-
-.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
- ld2_wrap \a\(), \a_ptr
- ld2_wrap \b\(), \b_ptr
- ld1 {\b\()1t.8h}, [\b_cache_ptr], #16
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
- out .req x0
- a0_ptr .req x1
- b0_ptr .req x2
- b0_cache_ptr .req x3
- a1_ptr .req x4
- b1_ptr .req x5
- b1_cache_ptr .req x6
- a2_ptr .req x7
- b2_ptr .req x8
- b2_cache_ptr .req x9
- a3_ptr .req x10
- b3_ptr .req x11
- b3_cache_ptr .req x12
- count .req x13
- wtmp .req w14
-
- modulus .req v0
- modulus_twisted .req v2
-
- aa0 .req v3
- aa1 .req v4
- bb0 .req v5
- bb1 .req v6
- bb1t .req v7
-
- res0l .req v8
- res1l .req v9
- res0h .req v10
- res1h .req v11
-
- tmp0 .req v12
- tmp1 .req v13
- q_tmp0 .req q12
- q_tmp1 .req q13
-
- out0 .req v26
- out1 .req v27
-
- t0 .req v28
-
-#if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
- push_stack
-
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
-
- mov count, #(MLKEM_N / 16)
- // Instructions: 75
- // Expected cycles: 94
- // Expected IPC: 0.80
-
- // Cycle bound: 94.0
- // IPC bound: 0.80
-
- // Wall time: 1.49s
- // User time: 1.49s
-
- // --------------------------- original position ---------------------------->
- // 0 25 50
- // |------------------------|------------------------|
- ldr q9, [x4], #32 // *..........................................................................
- ldr q5, [x4, #-16] // ......*....................................................................
- ldr q11, [x5], #32 // .*.........................................................................
- uzp1 v23.8H, v9.8H, v5.8H // .........*.................................................................
- uzp2 v9.8H, v9.8H, v5.8H // .....................*.....................................................
- ldr q5, [x2], #32 // ..*........................................................................
- ldr q7, [x5, #-16] // ..............*............................................................
- ldr q21, [x2, #-16] // ...*.......................................................................
- uzp2 v10.8H, v11.8H, v7.8H // .................*.........................................................
- uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................
- uzp1 v7.8H, v5.8H, v21.8H // ....*......................................................................
- uzp2 v5.8H, v5.8H, v21.8H // .....*.....................................................................
- ldr q21, [x1], #32 // .......*...................................................................
- ldr q25, [x1, #-16] // ........*..................................................................
- ld1 {v6.8H}, [x3], #16 // ............................*..............................................
- uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................
- uzp2 v21.8H, v21.8H, v25.8H // ...........*...............................................................
- smull v25.4S, v26.4H, v5.4H // ............*..............................................................
- smull2 v5.4S, v26.8H, v5.8H // .............*.............................................................
- smull v19.4S, v26.4H, v7.4H // ..........................*................................................
- smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................
- smlal v25.4S, v21.4H, v7.4H // ...............*...........................................................
- smlal2 v5.4S, v21.8H, v7.8H // ................*..........................................................
- smlal v19.4S, v21.4H, v6.4H // ...................................*.......................................
- smlal2 v26.4S, v21.8H, v6.8H // .................................*.........................................
- smlal v25.4S, v23.4H, v10.4H // ...................*.......................................................
- smlal2 v5.4S, v23.8H, v10.8H // ....................*......................................................
- smlal v19.4S, v23.4H, v11.4H // ......................................*....................................
- smlal2 v26.4S, v23.8H, v11.8H // ....................................*......................................
- ld1 {v23.8H}, [x6], #16 // ........................*..................................................
- smlal v25.4S, v9.4H, v11.4H // ......................*....................................................
- smlal2 v5.4S, v9.8H, v11.8H // .......................*...................................................
- smlal2 v26.4S, v9.8H, v23.8H // .......................................*...................................
- smlal v19.4S, v9.4H, v23.4H // .........................................*.................................
- ldr q9, [x4], #32 // ...............................*...........................................
- uzp1 v11.8H, v25.8H, v5.8H // .........................*.................................................
- uzp1 v23.8H, v19.8H, v26.8H // .............................................*.............................
- mul v11.8H, v11.8H, v2.8H // ...........................*...............................................
- mul v23.8H, v23.8H, v2.8H // ..............................................*............................
- ldr q7, [x5], #32 // ................................*..........................................
- smlal2 v5.4S, v11.8H, v0.8H // .............................*.............................................
- smlal v25.4S, v11.4H, v0.4H // ..................................*........................................
- ldr q11, [x2], #32 // .....................................*.....................................
- ldr q21, [x2, #-16] // ........................................*..................................
- ldr q6, [x4, #-16] // ...............................................*...........................
- uzp1 v17.8H, v11.8H, v21.8H // ...........................................*...............................
- ldr q10, [x1], #32 // ................................................*..........................
- ldr q29, [x1, #-16] // .................................................*.........................
- uzp2 v11.8H, v11.8H, v21.8H // ............................................*..............................
- uzp1 v13.8H, v9.8H, v6.8H // ...................................................*.......................
- uzp1 v3.8H, v10.8H, v29.8H // ....................................................*......................
- uzp2 v10.8H, v10.8H, v29.8H // .....................................................*.....................
- smull v12.4S, v3.4H, v11.4H // ......................................................*....................
- smull2 v11.4S, v3.8H, v11.8H // .......................................................*...................
- ldr q21, [x5, #-16] // ........................................................*..................
- smlal v12.4S, v10.4H, v17.4H // .........................................................*.................
- smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................
- uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*...............
- uzp1 v15.8H, v7.8H, v21.8H // ............................................................*..............
- smlal v12.4S, v13.4H, v29.4H // .............................................................*.............
- smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............
- uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*...........
- smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................
- smlal v12.4S, v28.4H, v15.4H // .................................................................*.........
- smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........
- smlal v19.4S, v23.4H, v0.4H // ................................................................*..........
- uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................
- smull v23.4S, v3.4H, v17.4H // ......................................................................*....
- uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*.....
- uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*......
- mul v14.8H, v9.8H, v2.8H // .......................................................................*...
- ld1 {v22.8H}, [x6], #16 // ...................................................................*.......
- zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
- smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................*
- ld1 {v4.8H}, [x3], #16 // .........................................................................*.
-
- // ------------------------------ new position ------------------------------>
- // 0 25 50
- // |------------------------|------------------------|------------------------
- // ldr q18, [x4], #32 // *..........................................................................
- // ldr q30, [x5], #32 // ..*........................................................................
- // ldr q8, [x2], #32 // .....*.....................................................................
- // ldr q9, [x2, #-16] // .......*...................................................................
- // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................
- // uzp2 v4.8H, v8.8H, v9.8H // ...........*...............................................................
- // ldr q19, [x4, #-16] // .*.........................................................................
- // ldr q29, [x1], #32 // ............*..............................................................
- // ldr q12, [x1, #-16] // .............*.............................................................
- // uzp1 v13.8H, v18.8H, v19.8H // ...*.......................................................................
- // uzp1 v3.8H, v29.8H, v12.8H // ...............*...........................................................
- // uzp2 v10.8H, v29.8H, v12.8H // ................*..........................................................
- // smull v12.4S, v3.4H, v4.4H // .................*.........................................................
- // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................
- // ldr q5, [x5, #-16] // ......*....................................................................
- // smlal v12.4S, v10.4H, v17.4H // .....................*.....................................................
- // smlal2 v11.4S, v10.8H, v17.8H // ......................*....................................................
- // uzp2 v14.8H, v30.8H, v5.8H // ........*..................................................................
- // uzp1 v15.8H, v30.8H, v5.8H // .........*.................................................................
- // smlal v12.4S, v13.4H, v14.4H // .........................*.................................................
- // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................
- // uzp2 v28.8H, v18.8H, v19.8H // ....*......................................................................
- // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................
- // smlal2 v11.4S, v28.8H, v15.8H // ...............................*...........................................
- // ld1 {v22.8H}, [x6], #16 // .............................*.............................................
- // uzp1 v1.8H, v12.8H, v11.8H // ...................................*.......................................
- // smull v23.4S, v3.4H, v17.4H // ...................*.......................................................
- // mul v14.8H, v1.8H, v2.8H // .....................................*.....................................
- // ld1 {v4.8H}, [x3], #16 // ..............*............................................................
- // smlal2 v11.4S, v14.8H, v0.8H // ........................................*..................................
- // smull2 v20.4S, v3.8H, v17.8H // ....................*......................................................
- // ldr q18, [x4], #32 // ..................................*........................................
- // ldr q30, [x5], #32 // .......................................*...................................
- // smlal2 v20.4S, v10.8H, v4.8H // ........................*..................................................
- // smlal v12.4S, v14.4H, v0.4H // .........................................*.................................
- // smlal v23.4S, v10.4H, v4.4H // .......................*...................................................
- // smlal2 v20.4S, v13.8H, v15.8H // ............................*..............................................
- // ldr q8, [x2], #32 // ..........................................*................................
- // smlal v23.4S, v13.4H, v15.4H // ...........................*...............................................
- // smlal2 v20.4S, v28.8H, v22.8H // ................................*..........................................
- // ldr q9, [x2, #-16] // ...........................................*...............................
- // smlal v23.4S, v28.4H, v22.4H // .................................*.........................................
- // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........
- // uzp1 v17.8H, v8.8H, v9.8H // .............................................*.............................
- // uzp2 v4.8H, v8.8H, v9.8H // ................................................*..........................
- // uzp1 v5.8H, v23.8H, v20.8H // ....................................*......................................
- // mul v31.8H, v5.8H, v2.8H // ......................................*....................................
- // ldr q19, [x4, #-16] // ............................................*..............................
- // ldr q29, [x1], #32 // ..............................................*............................
- // ldr q12, [x1, #-16] // ...............................................*...........................
- // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............
- // uzp1 v13.8H, v18.8H, v19.8H // .................................................*.........................
- // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................
- // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*.......................
- // smull v12.4S, v3.4H, v4.4H // ....................................................*......................
- // smull2 v11.4S, v3.8H, v4.8H // .....................................................*.....................
- // ldr q5, [x5, #-16] // ......................................................*....................
- // smlal v12.4S, v10.4H, v17.4H // .......................................................*...................
- // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*..................
- // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*.................
- // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................
- // smlal v12.4S, v13.4H, v14.4H // ...........................................................*...............
- // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*..............
- // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*.............
- // smlal v23.4S, v31.4H, v0.4H // .................................................................*.........
- // smlal v12.4S, v28.4H, v15.4H // ...............................................................*...........
- // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*..........
- // ld1 {v22.8H}, [x6], #16 // .......................................................................*...
- // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*.....
- // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*......
- // smull v23.4S, v3.4H, v17.4H // ...................................................................*.......
- // mul v14.8H, v1.8H, v2.8H // ......................................................................*....
- // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
- // ld1 {v4.8H}, [x3], #16 // ..........................................................................*
- // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*.
-
- sub count, count, #2
-1:
- // Instructions: 48
- // Expected cycles: 58
- // Expected IPC: 0.83
-
- // Cycle bound: 58.0
- // IPC bound: 0.83
-
- // Wall time: 6.39s
- // User time: 6.39s
-
- // -------------- original position -------------->
- // 0 25
- // |------------------------|----------------------
- smull2 v20.4S, v3.8H, v17.8H // ..........*.....................................
- ldr q18, [x4], #32 // .................e..............................
- ldr q30, [x5], #32 // .....................e..........................
- smlal2 v20.4S, v10.8H, v4.8H // ............*...................................
- smlal v12.4S, v14.4H, v0.4H // .........................................*......
- smlal v23.4S, v10.4H, v4.4H // ...........*....................................
- str q9, [x0, #16] // ...............................................l
- smlal2 v20.4S, v13.8H, v15.8H // ...........................*....................
- ldr q8, [x2], #32 // ....e...........................................
- smlal v23.4S, v13.4H, v15.4H // ..........................*.....................
- smlal2 v20.4S, v28.8H, v22.8H // .............................*..................
- zip1 v26.8H, v19.8H, v27.8H // ............................................l...
- ldr q9, [x2, #-16] // .....e..........................................
- smlal v23.4S, v28.4H, v22.4H // ............................*...................
- uzp2 v27.8H, v12.8H, v11.8H // ...........................................*....
- uzp1 v17.8H, v8.8H, v9.8H // ......e.........................................
- uzp2 v4.8H, v8.8H, v9.8H // .......e........................................
- uzp1 v5.8H, v23.8H, v20.8H // ..................................*.............
- str q26, [x0], #32 // ..............................................l.
- mul v31.8H, v5.8H, v2.8H // ...................................*............
- ldr q19, [x4, #-16] // ..................e.............................
- ldr q29, [x1], #32 // e...............................................
- ldr q12, [x1, #-16] // .e..............................................
- smlal2 v20.4S, v31.8H, v0.8H // .....................................*..........
- uzp1 v13.8H, v18.8H, v19.8H // ...................e............................
- uzp1 v3.8H, v29.8H, v12.8H // ..e.............................................
- uzp2 v10.8H, v29.8H, v12.8H // ...e............................................
- smull v12.4S, v3.4H, v4.4H // .............e..................................
- smull2 v11.4S, v3.8H, v4.8H // ..............e.................................
- ldr q5, [x5, #-16] // ......................e.........................
- smlal v12.4S, v10.4H, v17.4H // ...............e................................
- smlal2 v11.4S, v10.8H, v17.8H // ................e...............................
- uzp2 v14.8H, v30.8H, v5.8H // ........................e.......................
- uzp1 v15.8H, v30.8H, v5.8H // .......................e........................
- smlal v12.4S, v13.4H, v14.4H // ..............................e.................
- smlal2 v11.4S, v13.8H, v14.8H // ...............................e................
- uzp2 v28.8H, v18.8H, v19.8H // ....................e...........................
- smlal v23.4S, v31.4H, v0.4H // ....................................*...........
- smlal v12.4S, v28.4H, v15.4H // ................................e...............
- smlal2 v11.4S, v28.8H, v15.8H // .................................e..............
- ld1 {v22.8H}, [x6], #16 // .........................e......................
- uzp2 v19.8H, v23.8H, v20.8H // ......................................*.........
- uzp1 v1.8H, v12.8H, v11.8H // .......................................e........
- smull v23.4S, v3.4H, v17.4H // .........e......................................
- mul v14.8H, v1.8H, v2.8H // ........................................e.......
- zip2 v9.8H, v19.8H, v27.8H // .............................................*..
- ld1 {v4.8H}, [x3], #16 // ........e.......................................
- smlal2 v11.4S, v14.8H, v0.8H // ..........................................e.....
-
- // ------------------------------------------------- new position -------------------------------------------------->
- // 0 25 50 75 100
- // |------------------------|------------------------|------------------------|------------------------|-------------
- // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'..................
- // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'..................
- // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'..................
- // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'..................
- // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~..........
- // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~......
- // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~...
- // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~..
- // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'..................
- // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'..................
- // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~..................
- // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~.............
- // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~...............
- // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'..................
- // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'..................
- // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'..................
- // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'..................
- // ldr q12, [x4], #32 // e..............................................'~..............................................'~.................
- // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'..................
- // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'..................
- // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'..................
- // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................
- // ldr q13, [x5, #-16] // ............................e..................'............................~..................'..................
- // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'..................
- // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'..................
- // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'..................
- // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~.........
- // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~...........
- // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~.....
- // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........
- // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'..................
- // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'..................
- // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'..................
- // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'..................
- // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~.
- // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'..................
- // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'..................
- // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'..................
- // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'..................
- // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'..................
- // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'..................
- // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~..............
- // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'..................
- // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~....
- // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l.......
- // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'..................
- // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l
- // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............
-
- sub count, count, #1
- cbnz count, 1b
- // Instructions: 21
- // Expected cycles: 35
- // Expected IPC: 0.60
-
- // Cycle bound: 35.0
- // IPC bound: 0.60
-
- // Wall time: 0.08s
- // User time: 0.08s
-
- // ----- original position ----->
- // 0 25
- // |------------------------|----
- smull2 v5.4S, v3.8H, v17.8H // *.............................
- smlal v12.4S, v14.4H, v0.4H // ..*...........................
- smlal v23.4S, v10.4H, v4.4H // ...*..........................
- str q9, [x0, #16] // ....*.........................
- smlal2 v5.4S, v10.8H, v4.8H // .*............................
- uzp2 v11.8H, v12.8H, v11.8H // ..........*...................
- zip1 v9.8H, v19.8H, v27.8H // ........*.....................
- smlal v23.4S, v13.4H, v15.4H // ......*.......................
- smlal2 v5.4S, v13.8H, v15.8H // .....*........................
- str q9, [x0], #32 // ............*.................
- smlal v23.4S, v28.4H, v22.4H // .........*....................
- smlal2 v5.4S, v28.8H, v22.8H // .......*......................
- uzp1 v9.8H, v23.8H, v5.8H // ...........*..................
- mul v9.8H, v9.8H, v2.8H // .............*................
- smlal2 v5.4S, v9.8H, v0.8H // ..............*...............
- smlal v23.4S, v9.4H, v0.4H // ...............*..............
- uzp2 v9.8H, v23.8H, v5.8H // ................*.............
- zip2 v5.8H, v9.8H, v11.8H // .................*............
- zip1 v9.8H, v9.8H, v11.8H // ...................*..........
- str q5, [x0, #16] // ..................*...........
- str q9, [x0], #32 // ....................*.........
-
- // -------- new position -------->
- // 0 25
- // |------------------------|-----
- // smull2 v20.4S, v3.8H, v17.8H // *..............................
- // smlal2 v20.4S, v10.8H, v4.8H // ....*..........................
- // smlal v12.4S, v14.4H, v0.4H // .*.............................
- // smlal v23.4S, v10.4H, v4.4H // ..*............................
- // str q9, [x0, #16] // ...*...........................
- // smlal2 v20.4S, v13.8H, v15.8H // ........*......................
- // smlal v23.4S, v13.4H, v15.4H // .......*.......................
- // smlal2 v20.4S, v28.8H, v22.8H // ...........*...................
- // zip1 v26.8H, v19.8H, v27.8H // ......*........................
- // smlal v23.4S, v28.4H, v22.4H // ..........*....................
- // uzp2 v27.8H, v12.8H, v11.8H // .....*.........................
- // uzp1 v5.8H, v23.8H, v20.8H // ............*..................
- // str q26, [x0], #32 // .........*.....................
- // mul v31.8H, v5.8H, v2.8H // .............*.................
- // smlal2 v20.4S, v31.8H, v0.8H // ..............*................
- // smlal v23.4S, v31.4H, v0.4H // ...............*...............
- // uzp2 v19.8H, v23.8H, v20.8H // ................*..............
- // zip2 v9.8H, v19.8H, v27.8H // .................*.............
- // str q9, [x0, #16] // ...................*...........
- // zip1 v26.8H, v19.8H, v27.8H // ..................*............
- // str q26, [x0], #32 // ....................*..........
-
-
- pop_stack
- ret
-#endif /* MLKEM_K == 2 */
-
-#if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
-
- mov count, #(MLKEM_N / 16)
- // Instructions: 75
- // Expected cycles: 103
- // Expected IPC: 0.73
-
- // Cycle bound: 103.0
- // IPC bound: 0.73
-
- // Wall time: 0.94s
- // User time: 0.94s
-
- // --------------------------- original position ---------------------------->
- // 0 25 50
- // |------------------------|------------------------|
- ldr q7, [x2, #16] // *..........................................................................
- ldr q20, [x2], #32 // ..*........................................................................
- ldr q15, [x1, #16] // .*.........................................................................
- uzp1 v8.8H, v20.8H, v7.8H // ...............*...........................................................
- uzp2 v7.8H, v20.8H, v7.8H // ................*..........................................................
- ld1 {v20.8H}, [x3], #16 // ...*.......................................................................
- ldr q30, [x1], #32 // ..............*............................................................
- ldr q11, [x4], #32 // ....*......................................................................
- uzp1 v16.8H, v30.8H, v15.8H // .................*.........................................................
- uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................
- smull v30.4S, v16.4H, v7.4H // ...................*.......................................................
- smull2 v7.4S, v16.8H, v7.8H // ....................*......................................................
- smull v9.4S, v16.4H, v8.4H // .....................*.....................................................
- smull2 v16.4S, v16.8H, v8.8H // ......................*....................................................
- smlal v30.4S, v15.4H, v8.4H // .......................*...................................................
- smlal2 v7.4S, v15.8H, v8.8H // ........................*..................................................
- smlal v9.4S, v15.4H, v20.4H // .........................*.................................................
- smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................
- ldr q20, [x4, #-16] // .....*.....................................................................
- ldr q15, [x5], #32 // ......*....................................................................
- uzp1 v8.8H, v11.8H, v20.8H // ...........................*...............................................
- uzp2 v20.8H, v11.8H, v20.8H // ............................*..............................................
- ldr q11, [x5, #-16] // .......*...................................................................
- ld1 {v27.8H}, [x6], #16 // ........*..................................................................
- uzp1 v10.8H, v15.8H, v11.8H // .............................*.............................................
- uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................
- smlal v9.4S, v8.4H, v10.4H // ...............................*...........................................
- smlal2 v16.4S, v8.8H, v10.8H // ................................*..........................................
- smlal v30.4S, v8.4H, v15.4H // .................................*.........................................
- smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................
- smlal v9.4S, v20.4H, v27.4H // ...................................*.......................................
- smlal2 v16.4S, v20.8H, v27.8H // ....................................*......................................
- smlal v30.4S, v20.4H, v10.4H // .....................................*.....................................
- smlal2 v7.4S, v20.8H, v10.8H // ......................................*....................................
- ldr q20, [x7], #32 // .........*.................................................................
- ldr q15, [x7, #-16] // ..........*................................................................
- ldr q8, [x8], #32 // ...........*...............................................................
- uzp1 v11.8H, v20.8H, v15.8H // .......................................*...................................
- uzp2 v20.8H, v20.8H, v15.8H // ........................................*..................................
- ldr q15, [x8, #-16] // ............*..............................................................
- ld1 {v27.8H}, [x9], #16 // .............*.............................................................
- uzp1 v10.8H, v8.8H, v15.8H // .........................................*.................................
- uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................
- smlal v9.4S, v11.4H, v10.4H // ...........................................*...............................
- smlal2 v16.4S, v11.8H, v10.8H // ............................................*..............................
- smlal v30.4S, v11.4H, v15.4H // .............................................*.............................
- smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................
- smlal v9.4S, v20.4H, v27.4H // ...............................................*...........................
- smlal2 v16.4S, v20.8H, v27.8H // ................................................*..........................
- smlal v30.4S, v20.4H, v10.4H // .................................................*.........................
- smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................
- ldr q15, [x2], #32 // ...............................................................*...........
- uzp1 v20.8H, v9.8H, v16.8H // ....................................................*......................
- uzp1 v8.8H, v30.8H, v7.8H // .....................................................*.....................
- mul v20.8H, v20.8H, v2.8H // ......................................................*....................
- mul v8.8H, v8.8H, v2.8H // .......................................................*...................
- ldr q21, [x4], #32 // .................................................................*.........
- smlal v9.4S, v20.4H, v0.4H // ........................................................*..................
- smlal2 v16.4S, v20.8H, v0.8H // .........................................................*.................
- smlal v30.4S, v8.4H, v0.4H // ..........................................................*................
- smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*...............
- ldr q6, [x4, #-16] // ..................................................................*........
- uzp2 v27.8H, v9.8H, v16.8H // ............................................................*..............
- uzp2 v10.8H, v30.8H, v7.8H // .............................................................*.............
- ldr q16, [x2, #-16] // ...................................................*.......................
- ldr q30, [x1, #16] // ..............................................................*............
- ld1 {v9.8H}, [x3], #16 // ................................................................*..........
- ldr q1, [x5], #32 // ...................................................................*.......
- ldr q12, [x5, #-16] // ....................................................................*......
- ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
- ldr q19, [x7], #32 // ......................................................................*....
- ldr q31, [x7, #-16] // .......................................................................*...
- ldr q17, [x8], #32 // ........................................................................*..
- ldr q18, [x8, #-16] // .........................................................................*.
- ld1 {v25.8H}, [x9], #16 // ..........................................................................*
-
- // ------------------------------ new position ------------------------------>
- // 0 25 50
- // |------------------------|------------------------|------------------------
- // ldr q16, [x2, #16] // *..........................................................................
- // ldr q30, [x1, #16] // ..*........................................................................
- // ldr q15, [x2], #32 // .*.........................................................................
- // ld1 {v9.8H}, [x3], #16 // .....*.....................................................................
- // ldr q21, [x4], #32 // .......*...................................................................
- // ldr q6, [x4, #-16] // ..................*........................................................
- // ldr q1, [x5], #32 // ...................*.......................................................
- // ldr q12, [x5, #-16] // ......................*....................................................
- // ld1 {v24.8H}, [x6], #16 // .......................*...................................................
- // ldr q19, [x7], #32 // ..................................*........................................
- // ldr q31, [x7, #-16] // ...................................*.......................................
- // ldr q17, [x8], #32 // ....................................*......................................
- // ldr q18, [x8, #-16] // .......................................*...................................
- // ld1 {v25.8H}, [x9], #16 // ........................................*..................................
- // ldr q20, [x1], #32 // ......*....................................................................
- // uzp1 v7.8H, v15.8H, v16.8H // ...*.......................................................................
- // uzp2 v15.8H, v15.8H, v16.8H // ....*......................................................................
- // uzp1 v8.8H, v20.8H, v30.8H // ........*..................................................................
- // uzp2 v20.8H, v20.8H, v30.8H // .........*.................................................................
- // smull v30.4S, v8.4H, v15.4H // ..........*................................................................
- // smull2 v15.4S, v8.8H, v15.8H // ...........*...............................................................
- // smull v11.4S, v8.4H, v7.4H // ............*..............................................................
- // smull2 v8.4S, v8.8H, v7.8H // .............*.............................................................
- // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................
- // smlal2 v15.4S, v20.8H, v7.8H // ...............*...........................................................
- // smlal v11.4S, v20.4H, v9.4H // ................*..........................................................
- // smlal2 v8.4S, v20.8H, v9.8H // .................*.........................................................
- // uzp1 v7.8H, v21.8H, v6.8H // ....................*......................................................
- // uzp2 v20.8H, v21.8H, v6.8H // .....................*.....................................................
- // uzp1 v16.8H, v1.8H, v12.8H // ........................*..................................................
- // uzp2 v9.8H, v1.8H, v12.8H // .........................*.................................................
- // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................
- // smlal2 v8.4S, v7.8H, v16.8H // ...........................*...............................................
- // smlal v30.4S, v7.4H, v9.4H // ............................*..............................................
- // smlal2 v15.4S, v7.8H, v9.8H // .............................*.............................................
- // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................
- // smlal2 v8.4S, v20.8H, v24.8H // ...............................*...........................................
- // smlal v30.4S, v20.4H, v16.4H // ................................*..........................................
- // smlal2 v15.4S, v20.8H, v16.8H // .................................*.........................................
- // uzp1 v7.8H, v19.8H, v31.8H // .....................................*.....................................
- // uzp2 v20.8H, v19.8H, v31.8H // ......................................*....................................
- // uzp1 v16.8H, v17.8H, v18.8H // .........................................*.................................
- // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................
- // smlal v11.4S, v7.4H, v16.4H // ...........................................*...............................
- // smlal2 v8.4S, v7.8H, v16.8H // ............................................*..............................
- // smlal v30.4S, v7.4H, v9.4H // .............................................*.............................
- // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................
- // smlal v11.4S, v20.4H, v25.4H // ...............................................*...........................
- // smlal2 v8.4S, v20.8H, v25.8H // ................................................*..........................
- // smlal v30.4S, v20.4H, v16.4H // .................................................*.........................
- // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................
- // ldr q16, [x2, #16] // ................................................................*..........
- // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*......................
- // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*.....................
- // mul v7.8H, v7.8H, v2.8H // ......................................................*....................
- // mul v20.8H, v20.8H, v2.8H // .......................................................*...................
- // smlal v11.4S, v7.4H, v0.4H // .........................................................*.................
- // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................
- // smlal v30.4S, v20.4H, v0.4H // ...........................................................*...............
- // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*..............
- // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............
- // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*...........
- // ldr q30, [x1, #16] // .................................................................*.........
- // ldr q15, [x2], #32 // ...................................................*.......................
- // ld1 {v9.8H}, [x3], #16 // ..................................................................*........
- // ldr q21, [x4], #32 // ........................................................*..................
- // ldr q6, [x4, #-16] // .............................................................*.............
- // ldr q1, [x5], #32 // ...................................................................*.......
- // ldr q12, [x5, #-16] // ....................................................................*......
- // ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
- // ldr q19, [x7], #32 // ......................................................................*....
- // ldr q31, [x7, #-16] // .......................................................................*...
- // ldr q17, [x8], #32 // ........................................................................*..
- // ldr q18, [x8, #-16] // .........................................................................*.
- // ld1 {v25.8H}, [x9], #16 // ..........................................................................*
-
- sub count, count, #2
-1:
- // Instructions: 65
- // Expected cycles: 80
- // Expected IPC: 0.81
-
- // Cycle bound: 80.0
- // IPC bound: 0.81
-
- // Wall time: 11.64s
- // User time: 11.64s
-
- // ---------------------- original position ----------------------->
- // 0 25 50
- // |------------------------|------------------------|--------------
- ldr q20, [x1], #32 // *................................................................
- uzp1 v7.8H, v15.8H, v16.8H // ......*..........................................................
- uzp2 v15.8H, v15.8H, v16.8H // .......*.........................................................
- uzp1 v8.8H, v20.8H, v30.8H // ..*..............................................................
- uzp2 v20.8H, v20.8H, v30.8H // ...*.............................................................
- smull v30.4S, v8.4H, v15.4H // .............*...................................................
- smull2 v15.4S, v8.8H, v15.8H // ..............*..................................................
- smull v11.4S, v8.4H, v7.4H // .........*.......................................................
- smull2 v8.4S, v8.8H, v7.8H // ..........*......................................................
- smlal v30.4S, v20.4H, v7.4H // ...............*.................................................
- smlal2 v15.4S, v20.8H, v7.8H // ................*................................................
- smlal v11.4S, v20.4H, v9.4H // ...........*.....................................................
- smlal2 v8.4S, v20.8H, v9.8H // ............*....................................................
- uzp1 v7.8H, v21.8H, v6.8H // ...................*.............................................
- uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................
- uzp1 v16.8H, v1.8H, v12.8H // .......................*.........................................
- uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................
- smlal v11.4S, v7.4H, v16.4H // ..........................*......................................
- smlal2 v8.4S, v7.8H, v16.8H // ...........................*.....................................
- smlal v30.4S, v7.4H, v9.4H // ..............................*..................................
- smlal2 v15.4S, v7.8H, v9.8H // ...............................*.................................
- smlal v11.4S, v20.4H, v24.4H // ............................*....................................
- smlal2 v8.4S, v20.8H, v24.8H // .............................*...................................
- smlal v30.4S, v20.4H, v16.4H // ................................*................................
- smlal2 v15.4S, v20.8H, v16.8H // .................................*...............................
- uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................
- uzp2 v20.8H, v19.8H, v31.8H // .....................................*...........................
- uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................
- uzp2 v9.8H, v17.8H, v18.8H // .........................................*.......................
- smlal v11.4S, v7.4H, v16.4H // ...........................................*.....................
- smlal2 v8.4S, v7.8H, v16.8H // ............................................*....................
- smlal v30.4S, v7.4H, v9.4H // ...............................................*.................
- smlal2 v15.4S, v7.8H, v9.8H // ................................................*................
- smlal v11.4S, v20.4H, v25.4H // .............................................*...................
- smlal2 v8.4S, v20.8H, v25.8H // ..............................................*..................
- smlal v30.4S, v20.4H, v16.4H // .................................................*...............
- smlal2 v15.4S, v20.8H, v16.8H // ..................................................*..............
- ldr q16, [x2, #16] // .....e...........................................................
- uzp1 v7.8H, v11.8H, v8.8H // ...................................................*.............
- uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........
- mul v7.8H, v7.8H, v2.8H // ....................................................*............
- mul v20.8H, v20.8H, v2.8H // .........................................................*.......
- zip2 v9.8H, v27.8H, v10.8H // ..............................................................l..
- zip1 v27.8H, v27.8H, v10.8H // .............................................................l...
- smlal v11.4S, v7.4H, v0.4H // .....................................................*...........
- smlal2 v8.4S, v7.8H, v0.8H // ......................................................*..........
- smlal v30.4S, v20.4H, v0.4H // ..........................................................*......
- smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*.....
- str q27, [x0], #32 // ...............................................................l.
- uzp2 v27.8H, v11.8H, v8.8H // .......................................................*.........
- str q9, [x0, #-16] // ................................................................l
- uzp2 v10.8H, v30.8H, v15.8H // ............................................................*....
- ldr q30, [x1, #16] // .e...............................................................
- ldr q15, [x2], #32 // ....e............................................................
- ld1 {v9.8H}, [x3], #16 // ........e........................................................
- ldr q21, [x4], #32 // .................e...............................................
- ldr q6, [x4, #-16] // ..................e..............................................
- ldr q1, [x5], #32 // .....................e...........................................
- ldr q12, [x5, #-16] // ......................e..........................................
- ld1 {v24.8H}, [x6], #16 // .........................e.......................................
- ldr q19, [x7], #32 // ..................................e..............................
- ldr q31, [x7, #-16] // ...................................e.............................
- ldr q17, [x8], #32 // ......................................e..........................
- ldr q18, [x8, #-16] // .......................................e.........................
- ld1 {v25.8H}, [x9], #16 // ..........................................e......................
-
- // ---------------------------------------------------------------- new position ----------------------------------------------------------------->
- // 0 25 50 75 100 125
- // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------
- // ldr q12, [x1], #32 // ............................*................................................................~..................................................
- // ldr q13, [x1, #-16] // ...............e............'...................................................~............'..................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~...............................................
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~..............................................
- // ldr q12, [x2], #32 // ................e...........'....................................................~...........'..................................................
- // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~.............
- // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~.................................................
- // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................
- // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'..................................................
- // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~...........................................
- // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~..........................................
- // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~.......................................
- // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~......................................
- // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~.............................................
- // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................
- // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~.........................................
- // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................
- // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'..................................................
- // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'..................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~.....................................
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~....................................
- // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'..................................................
- // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'..................................................
- // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~...................................
- // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~..................................
- // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'..................................................
- // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~.................................
- // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................
- // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~.............................
- // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................
- // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~...............................
- // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~..............................
- // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~...........................
- // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~..........................
- // ldr q12, [x7], #32 // .......................e....'...........................................................~....'..................................................
- // ldr q13, [x7, #-16] // ........................e...'............................................................~...'..................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~.........................
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................
- // ldr q12, [x8], #32 // .........................e..'.............................................................~..'..................................................
- // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'..................................................
- // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~.......................
- // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~......................
- // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'..................................................
- // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~.....................
- // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~....................
- // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~.................
- // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................
- // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~...................
- // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~..................
- // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~...............
- // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~..............
- // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............
- // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~..........
- // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~......
- // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~.....
- // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~.
- // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~...........
- // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~.........
- // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~....
- // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~...
- // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'..................................................
- // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l.......
- // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........
- // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l..
- // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l
-
- sub count, count, #1
- cbnz count, 1b
- // Instructions: 55
- // Expected cycles: 61
- // Expected IPC: 0.90
-
- // Cycle bound: 61.0
- // IPC bound: 0.90
-
- // Wall time: 8.41s
- // User time: 8.41s
-
- // ----------------- original position ------------------>
- // 0 25 50
- // |------------------------|------------------------|----
- ldr q7, [x1], #32 // *......................................................
- uzp1 v20.8H, v15.8H, v16.8H // .*.....................................................
- uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
- uzp1 v23.8H, v7.8H, v30.8H // ...*...................................................
- uzp2 v11.8H, v7.8H, v30.8H // ....*..................................................
- smull2 v8.4S, v23.8H, v20.8H // ........*..............................................
- smull v5.4S, v23.4H, v20.4H // .......*...............................................
- smull2 v30.4S, v23.8H, v15.8H // ......*................................................
- uzp1 v28.8H, v1.8H, v12.8H // ...............*.......................................
- smlal2 v8.4S, v11.8H, v9.8H // ............*..........................................
- smlal v5.4S, v11.4H, v9.4H // ...........*...........................................
- uzp1 v3.8H, v21.8H, v6.8H // .............*.........................................
- smull v16.4S, v23.4H, v15.4H // .....*.................................................
- smlal2 v8.4S, v3.8H, v28.8H // ..................*....................................
- smlal v5.4S, v3.4H, v28.4H // .................*.....................................
- uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................
- uzp1 v7.8H, v17.8H, v18.8H // ...........................*...........................
- smlal2 v8.4S, v29.8H, v24.8H // ......................*................................
- uzp1 v14.8H, v19.8H, v31.8H // .........................*.............................
- smlal v16.4S, v11.4H, v20.4H // .........*.............................................
- smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................
- smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................
- uzp2 v20.8H, v1.8H, v12.8H // ................*......................................
- uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................
- smlal2 v30.4S, v3.8H, v20.8H // ....................*..................................
- smlal v16.4S, v3.4H, v20.4H // ...................*...................................
- smlal v5.4S, v29.4H, v24.4H // .....................*.................................
- uzp2 v9.8H, v17.8H, v18.8H // ............................*..........................
- smlal2 v30.4S, v29.8H, v28.8H // ........................*..............................
- smlal v16.4S, v29.4H, v28.4H // .......................*...............................
- smlal v5.4S, v14.4H, v7.4H // .............................*.........................
- smlal2 v8.4S, v21.8H, v25.8H // ..................................*....................
- smlal2 v30.4S, v14.8H, v9.8H // ................................*......................
- smlal v16.4S, v14.4H, v9.4H // ...............................*.......................
- smlal v5.4S, v21.4H, v25.4H // .................................*.....................
- zip1 v20.8H, v27.8H, v10.8H // ..........................................*............
- smlal2 v30.4S, v21.8H, v7.8H // ....................................*..................
- smlal v16.4S, v21.4H, v7.4H // ...................................*...................
- uzp1 v7.8H, v5.8H, v8.8H // .....................................*.................
- str q20, [x0], #32 // ...............................................*.......
- mul v15.8H, v7.8H, v2.8H // .......................................*...............
- uzp1 v7.8H, v16.8H, v30.8H // ......................................*................
- zip2 v31.8H, v27.8H, v10.8H // .........................................*.............
- mul v20.8H, v7.8H, v2.8H // ........................................*..............
- smlal v5.4S, v15.4H, v0.4H // ...........................................*...........
- smlal2 v8.4S, v15.8H, v0.8H // ............................................*..........
- str q31, [x0, #-16] // .................................................*.....
- smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........
- smlal v16.4S, v20.4H, v0.4H // .............................................*.........
- uzp2 v15.8H, v5.8H, v8.8H // ................................................*......
- uzp2 v20.8H, v16.8H, v30.8H // ..................................................*....
- zip1 v7.8H, v15.8H, v20.8H // ....................................................*..
- zip2 v20.8H, v15.8H, v20.8H // ...................................................*...
- str q7, [x0], #32 // .....................................................*.
- str q20, [x0, #-16] // ......................................................*
-
- // -------------------- new position -------------------->
- // 0 25 50
- // |------------------------|------------------------|----
- // ldr q20, [x1], #32 // *......................................................
- // uzp1 v7.8H, v15.8H, v16.8H // .*.....................................................
- // uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
- // uzp1 v8.8H, v20.8H, v30.8H // ...*...................................................
- // uzp2 v20.8H, v20.8H, v30.8H // ....*..................................................
- // smull v30.4S, v8.4H, v15.4H // ............*..........................................
- // smull2 v15.4S, v8.8H, v15.8H // .......*...............................................
- // smull v11.4S, v8.4H, v7.4H // ......*................................................
- // smull2 v8.4S, v8.8H, v7.8H // .....*.................................................
- // smlal v30.4S, v20.4H, v7.4H // ...................*...................................
- // smlal2 v15.4S, v20.8H, v7.8H // ....................*..................................
- // smlal v11.4S, v20.4H, v9.4H // ..........*............................................
- // smlal2 v8.4S, v20.8H, v9.8H // .........*.............................................
- // uzp1 v7.8H, v21.8H, v6.8H // ...........*...........................................
- // uzp2 v20.8H, v21.8H, v6.8H // ...............*.......................................
- // uzp1 v16.8H, v1.8H, v12.8H // ........*..............................................
- // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................
- // smlal v11.4S, v7.4H, v16.4H // ..............*........................................
- // smlal2 v8.4S, v7.8H, v16.8H // .............*.........................................
- // smlal v30.4S, v7.4H, v9.4H // .........................*.............................
- // smlal2 v15.4S, v7.8H, v9.8H // ........................*..............................
- // smlal v11.4S, v20.4H, v24.4H // ..........................*............................
- // smlal2 v8.4S, v20.8H, v24.8H // .................*.....................................
- // smlal v30.4S, v20.4H, v16.4H // .............................*.........................
- // smlal2 v15.4S, v20.8H, v16.8H // ............................*..........................
- // uzp1 v7.8H, v19.8H, v31.8H // ..................*....................................
- // uzp2 v20.8H, v19.8H, v31.8H // .......................*...............................
- // uzp1 v16.8H, v17.8H, v18.8H // ................*......................................
- // uzp2 v9.8H, v17.8H, v18.8H // ...........................*...........................
- // smlal v11.4S, v7.4H, v16.4H // ..............................*........................
- // smlal2 v8.4S, v7.8H, v16.8H // .....................*.................................
- // smlal v30.4S, v7.4H, v9.4H // .................................*.....................
- // smlal2 v15.4S, v7.8H, v9.8H // ................................*......................
- // smlal v11.4S, v20.4H, v25.4H // ..................................*....................
- // smlal2 v8.4S, v20.8H, v25.8H // ...............................*.......................
- // smlal v30.4S, v20.4H, v16.4H // .....................................*.................
- // smlal2 v15.4S, v20.8H, v16.8H // ....................................*..................
- // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................
- // uzp1 v20.8H, v30.8H, v15.8H // .........................................*.............
- // mul v7.8H, v7.8H, v2.8H // ........................................*..............
- // mul v20.8H, v20.8H, v2.8H // ...........................................*...........
- // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............
- // zip1 v27.8H, v27.8H, v10.8H // ...................................*...................
- // smlal v11.4S, v7.4H, v0.4H // ............................................*..........
- // smlal2 v8.4S, v7.8H, v0.8H // .............................................*.........
- // smlal v30.4S, v20.4H, v0.4H // ................................................*......
- // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*.......
- // str q27, [x0], #32 // .......................................*...............
- // uzp2 v27.8H, v11.8H, v8.8H // .................................................*.....
- // str q9, [x0, #-16] // ..............................................*........
- // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*....
- // zip2 v9.8H, v27.8H, v10.8H // ....................................................*..
- // zip1 v27.8H, v27.8H, v10.8H // ...................................................*...
- // str q27, [x0], #32 // .....................................................*.
- // str q9, [x0, #-16] // ......................................................*
-
-
- pop_stack
- ret
-#endif /* MLKEM_K == 3 */
-
-#if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
- add a3_ptr, a0_ptr, #(3 * 512)
- add b3_ptr, b0_ptr, #(3 * 512)
- add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2)
-
- // Bounds:
-
- // Each pmull is bound by 2*4096*2^15=2^28, so the final value
- // before Montgomery reduction is bound by 2^30.
-
- mov count, #(MLKEM_N / 16)
- // Instructions: 114
- // Expected cycles: 153
- // Expected IPC: 0.75
- //
- // Cycle bound: 153.0
- // IPC bound: 0.75
- //
- // Wall time: 0.69s
- // User time: 0.69s
- //
- // ----------------------------------------------- original position ----------------------------------------------->
- // 0 25 50 75 100
- // |------------------------|------------------------|------------------------|------------------------|-------------
- ldr q23, [x2, #16] // .*................................................................................................................
- ldr q19, [x2], #32 // *.................................................................................................................
- ldr q17, [x5], #32 // ..*...............................................................................................................
- uzp2 v13.8H, v19.8H, v23.8H // ..........*.......................................................................................................
- uzp1 v19.8H, v19.8H, v23.8H // ...........*......................................................................................................
- ldr q23, [x5, #-16] // ...*..............................................................................................................
- ldr q30, [x1, #16] // .....*............................................................................................................
- uzp2 v9.8H, v17.8H, v23.8H // ....*.............................................................................................................
- uzp1 v23.8H, v17.8H, v23.8H // .......*..........................................................................................................
- ldr q17, [x1], #32 // ......*...........................................................................................................
- ldr q10, [x7, #16] // .............*....................................................................................................
- uzp1 v12.8H, v17.8H, v30.8H // ........*.........................................................................................................
- uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................
- smull2 v30.4S, v12.8H, v13.8H // ............*.....................................................................................................
- smull v13.4S, v12.4H, v13.4H // ............................................*.....................................................................
- smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................
- smull v12.4S, v12.4H, v19.4H // ..........................................*.......................................................................
- smlal2 v30.4S, v17.8H, v19.8H // ...............................*..................................................................................
- smlal v13.4S, v17.4H, v19.4H // ...............................................*..................................................................
- ldr q19, [x4], #32 // ....................*.............................................................................................
- ldr q16, [x4, #-16] // .....................*............................................................................................
- ld1 {v8.8H}, [x3], #16 // ................................*.................................................................................
- uzp1 v26.8H, v19.8H, v16.8H // .......................*..........................................................................................
- uzp2 v19.8H, v19.8H, v16.8H // ........................*.........................................................................................
- smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................
- smlal v13.4S, v26.4H, v9.4H // ..................................................*...............................................................
- smlal2 v22.4S, v17.8H, v8.8H // ........................................*.........................................................................
- smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................
- smlal2 v30.4S, v19.8H, v23.8H // ...................................*..............................................................................
- smlal v13.4S, v19.4H, v23.4H // .......................................................*..........................................................
- smlal2 v22.4S, v26.8H, v23.8H // ...........................................*......................................................................
- smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................
- ldr q23, [x7], #32 // ......................*...........................................................................................
- ldr q17, [x8, #16] // ..............*...................................................................................................
- uzp1 v9.8H, v23.8H, v10.8H // ..........................*.......................................................................................
- uzp2 v23.8H, v23.8H, v10.8H // ....................................*.............................................................................
- ldr q10, [x10], #32 // ...............*..................................................................................................
- ldr q16, [x10, #-16] // ................*.................................................................................................
- ld1 {v8.8H}, [x12], #16 // .................*................................................................................................
- uzp1 v26.8H, v10.8H, v16.8H // ..................*...............................................................................................
- uzp2 v10.8H, v10.8H, v16.8H // ...................*..............................................................................................
- ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................
- ldr q3, [x11, #16] // ...........................*......................................................................................
- smlal2 v22.4S, v19.8H, v16.8H // ..............................................*...................................................................
- smlal v12.4S, v19.4H, v16.4H // ........................................................*.........................................................
- ldr q19, [x11], #32 // ............................*.....................................................................................
- ld1 {v16.8H}, [x9], #16 // .............................*....................................................................................
- uzp1 v4.8H, v19.8H, v3.8H // ..................................*...............................................................................
- uzp2 v19.8H, v19.8H, v3.8H // .......................................*..........................................................................
- ldr q3, [x8], #32 // ..............................*...................................................................................
- ldr q31, [x2], #32 // ......................................*...........................................................................
- uzp1 v6.8H, v3.8H, v17.8H // ...................................................*..............................................................
- uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................
- smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*.......................................................
- smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*......................................................
- smlal v13.4S, v9.4H, v17.4H // ............................................................*.....................................................
- smlal v12.4S, v9.4H, v6.4H // .............................................................*....................................................
- smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*...................................................
- smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*..................................................
- smlal v13.4S, v23.4H, v6.4H // ................................................................*.................................................
- smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................
- smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*...............................................
- smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*..............................................
- smlal v13.4S, v26.4H, v19.4H // ....................................................................*.............................................
- smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................
- smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*...........................................
- smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*..........................................
- smlal v13.4S, v10.4H, v4.4H // ........................................................................*.........................................
- smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................
- ldr q19, [x2, #-16] // .........................................*........................................................................
- uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*......................................
- uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*.............................
- mul v23.8H, v23.8H, v2.8H // .............................................................................*....................................
- uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*.................................
- uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*..............................
- mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................
- smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................
- smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*...............................
- ldr q23, [x5], #32 // .............................................*....................................................................
- smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*......
- uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*...........................
- smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*.....
- ldr q17, [x5, #-16] // ................................................*.................................................................
- ldr q13, [x1, #16] // ......................................................*...........................................................
- uzp2 v27.8H, v23.8H, v17.8H // ....................................................*.............................................................
- uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*.....................................
- uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*..
- ldr q23, [x1], #32 // ..........................................................................*.......................................
- zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................*
- ldr q3, [x7, #16] // ........................................................................................*.........................
- uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*...................................
- uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*..................................
- smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*..........................
- ldr q6, [x8, #16] // .........................................................................................*........................
- ldr q23, [x10], #32 // ..........................................................................................*.......................
- smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*.......
- ldr q17, [x10, #-16] // ...........................................................................................*......................
- ld1 {v22.8H}, [x12], #16 // ............................................................................................*.....................
- uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*....................
- uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*...................
- ldr q23, [x4], #32 // ...............................................................................................*..................
- ldr q17, [x4, #-16] // ................................................................................................*.................
- ldr q4, [x7], #32 // .................................................................................................*................
- uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*...............
- uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*..............
- uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............
- smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*...
- ld1 {v8.8H}, [x6], #16 // ....................................................................................................*.............
- ldr q25, [x11, #16] // ......................................................................................................*...........
- ldr q29, [x11], #32 // .......................................................................................................*..........
- ld1 {v12.8H}, [x9], #16 // ........................................................................................................*.........
- uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*.
- ldr q14, [x8], #32 // .........................................................................................................*........
- ld1 {v23.8H}, [x3], #16 // .............................................................................................................*....
-
- // ------------------------------------------------- new position -------------------------------------------------->
- // 0 25 50 75 100
- // |------------------------|------------------------|------------------------|------------------------|-------------
- // ldr q3, [x2], #32 // .*................................................................................................................
- // ldr q17, [x2, #-16] // *.................................................................................................................
- // ldr q21, [x5], #32 // ..*...............................................................................................................
- // ldr q19, [x5, #-16] // .....*............................................................................................................
- // uzp2 v27.8H, v21.8H, v19.8H // .......*..........................................................................................................
- // ldr q25, [x1, #16] // ......*...........................................................................................................
- // ldr q22, [x1], #32 // .........*........................................................................................................
- // uzp1 v28.8H, v21.8H, v19.8H // ........*.........................................................................................................
- // uzp1 v31.8H, v22.8H, v25.8H // ...........*......................................................................................................
- // uzp2 v16.8H, v22.8H, v25.8H // ............*.....................................................................................................
- // uzp2 v21.8H, v3.8H, v17.8H // ...*..............................................................................................................
- // uzp1 v19.8H, v3.8H, v17.8H // ....*.............................................................................................................
- // smull2 v24.4S, v31.8H, v21.8H // .............*....................................................................................................
- // ldr q3, [x7, #16] // ..........*.......................................................................................................
- // ldr q6, [x8, #16] // .................................*................................................................................
- // ldr q8, [x10], #32 // ....................................*.............................................................................
- // ldr q26, [x10, #-16] // .....................................*............................................................................
- // ld1 {v22.8H}, [x12], #16 // ......................................*...........................................................................
- // uzp1 v30.8H, v8.8H, v26.8H // .......................................*..........................................................................
- // uzp2 v11.8H, v8.8H, v26.8H // ........................................*.........................................................................
- // ldr q8, [x4], #32 // ...................*..............................................................................................
- // ldr q26, [x4, #-16] // ....................*.............................................................................................
- // ldr q4, [x7], #32 // ................................*.................................................................................
- // uzp1 v20.8H, v8.8H, v26.8H // ......................*...........................................................................................
- // uzp2 v26.8H, v8.8H, v26.8H // .......................*..........................................................................................
- // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................
- // uzp1 v9.8H, v4.8H, v3.8H // ..................................*...............................................................................
- // ldr q25, [x11, #16] // ..........................................*.......................................................................
- // ldr q29, [x11], #32 // .............................................*....................................................................
- // ld1 {v12.8H}, [x9], #16 // ..............................................*...................................................................
- // ldr q14, [x8], #32 // .................................................*................................................................
- // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................
- // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................
- // smlal2 v24.4S, v20.8H, v27.8H // ........................*.........................................................................................
- // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*..................................................................
- // smlal2 v24.4S, v26.8H, v28.8H // ............................*.....................................................................................
- // uzp2 v4.8H, v4.8H, v3.8H // ...................................*..............................................................................
- // smull2 v13.4S, v31.8H, v19.8H // ...............*..................................................................................................
- // ldr q3, [x2], #32 // ..................................................*...............................................................
- // uzp2 v1.8H, v29.8H, v25.8H // ................................................*.................................................................
- // smlal2 v13.4S, v16.8H, v23.8H // ..........................*.......................................................................................
- // ldr q17, [x2, #-16] // .....................................................................*............................................
- // smull v18.4S, v31.4H, v19.4H // ................*.................................................................................................
- // smlal2 v13.4S, v20.8H, v28.8H // ..............................*...................................................................................
- // smull v29.4S, v31.4H, v21.4H // ..............*...................................................................................................
- // ldr q21, [x5], #32 // ..............................................................................*...................................
- // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*......................................................................
- // smlal v29.4S, v16.4H, v19.4H // ..................*...............................................................................................
- // ldr q19, [x5, #-16] // ..................................................................................*...............................
- // smlal v18.4S, v16.4H, v23.4H // ...........................*......................................................................................
- // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................
- // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*..............................................................
- // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*.............................
- // smlal v18.4S, v20.4H, v28.4H // ...............................*..................................................................................
- // ldr q25, [x1, #16] // ...................................................................................*..............................
- // smlal v29.4S, v26.4H, v28.4H // .............................*....................................................................................
- // smlal v18.4S, v26.4H, v8.4H // ............................................*.....................................................................
- // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*.............................................................
- // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................
- // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*...........................................................
- // smlal v29.4S, v9.4H, v26.4H // .......................................................*..........................................................
- // smlal v18.4S, v9.4H, v31.4H // ........................................................*.........................................................
- // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................
- // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*.......................................................
- // smlal v29.4S, v4.4H, v31.4H // ...........................................................*......................................................
- // smlal v18.4S, v4.4H, v12.4H // ............................................................*.....................................................
- // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................................................
- // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*...................................................
- // smlal v29.4S, v30.4H, v1.4H // ...............................................................*..................................................
- // smlal v18.4S, v30.4H, v10.4H // ................................................................*.................................................
- // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................
- // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*...............................................
- // smlal v29.4S, v11.4H, v10.4H // ...................................................................*..............................................
- // smlal v18.4S, v11.4H, v22.4H // ....................................................................*.............................................
- // ldr q22, [x1], #32 // .......................................................................................*..........................
- // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*...........................................
- // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................
- // mul v19.8H, v31.8H, v2.8H // ........................................................................*.........................................
- // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*.......................
- // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*......................
- // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................
- // smlal v29.4S, v19.4H, v0.4H // ............................................................................*.....................................
- // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*....................................
- // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*.......................................
- // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*..........................................
- // mul v23.8H, v26.8H, v2.8H // ...........................................................................*......................................
- // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*.................................
- // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*.....................
- // ldr q3, [x7, #16] // .........................................................................................*........................
- // ldr q6, [x8, #16] // .............................................................................................*....................
- // ldr q8, [x10], #32 // ..............................................................................................*...................
- // ldr q26, [x10, #-16] // ................................................................................................*.................
- // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................
- // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*...............
- // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*..............
- // ldr q8, [x4], #32 // ....................................................................................................*.............
- // ldr q26, [x4, #-16] // .....................................................................................................*............
- // ldr q4, [x7], #32 // ......................................................................................................*...........
- // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*..........
- // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*.........
- // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*......
- // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........
- // ldr q25, [x11, #16] // ............................................................................................................*.....
- // ldr q29, [x11], #32 // .............................................................................................................*....
- // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*...
- // ldr q14, [x8], #32 // ................................................................................................................*.
- // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*..................
- // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*..................................
- // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................
- // ld1 {v23.8H}, [x3], #16 // .................................................................................................................*
- // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*.......
- // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*...........................
- // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*..
- // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*.........................
-
- sub count, count, #2
-1:
- // Instructions: 82
- // Expected cycles: 102
- // Expected IPC: 0.80
- //
- // Cycle bound: 102.0
- // IPC bound: 0.80
- //
- // Wall time: 15.93s
- // User time: 15.93s
- //
- // ------------------------------- original position ------------------------------->
- // 0 25 50 75
- // |------------------------|------------------------|------------------------|------
- smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................
- uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................
- smull2 v13.4S, v31.8H, v19.8H // ..........*.......................................................................
- ldr q3, [x2], #32 // ....e.............................................................................
- uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*.......................
- smlal2 v13.4S, v16.8H, v23.8H // ............*.....................................................................
- ldr q17, [x2, #-16] // .....e............................................................................
- smull v18.4S, v31.4H, v19.4H // .........*........................................................................
- smlal2 v13.4S, v20.8H, v28.8H // ...........................*......................................................
- smull v29.4S, v31.4H, v21.4H // .............*....................................................................
- ldr q21, [x5], #32 // .....................e............................................................
- smlal2 v13.4S, v26.8H, v8.8H // .............................*....................................................
- smlal v29.4S, v16.4H, v19.4H // ...............*..................................................................
- ldr q19, [x5, #-16] // ......................e...........................................................
- smlal v18.4S, v16.4H, v23.4H // ...........*......................................................................
- smlal v29.4S, v20.4H, v27.4H // ..............................*...................................................
- uzp1 v31.8H, v14.8H, v6.8H // ........................................*.........................................
- uzp2 v27.8H, v21.8H, v19.8H // ........................e.........................................................
- smlal v18.4S, v20.4H, v28.4H // ..........................*.......................................................
- ldr q25, [x1, #16] // .e................................................................................
- smlal v29.4S, v26.4H, v28.4H // ................................*.................................................
- smlal v18.4S, v26.4H, v8.4H // ............................*.....................................................
- uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................
- smlal2 v13.4S, v9.8H, v31.8H // ............................................*.....................................
- smlal2 v24.4S, v9.8H, v26.8H // ................................................*.................................
- smlal v29.4S, v9.4H, v26.4H // ...............................................*..................................
- smlal v18.4S, v9.4H, v31.4H // ...........................................*......................................
- smlal2 v13.4S, v4.8H, v12.8H // ..............................................*...................................
- smlal2 v24.4S, v4.8H, v31.8H // ..................................................*...............................
- smlal v29.4S, v4.4H, v31.4H // .................................................*................................
- smlal v18.4S, v4.4H, v12.4H // .............................................*....................................
- smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................
- smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................
- smlal v29.4S, v30.4H, v1.4H // ................................................................*.................
- smlal v18.4S, v30.4H, v10.4H // ............................................................*.....................
- smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*..................
- smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*..............
- smlal v29.4S, v11.4H, v10.4H // ..................................................................*...............
- smlal v18.4S, v11.4H, v22.4H // ..............................................................*...................
- ldr q22, [x1], #32 // e.................................................................................
- uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........
- uzp1 v28.8H, v21.8H, v19.8H // .......................e..........................................................
- mul v19.8H, v31.8H, v2.8H // ..........................................................................*.......
- uzp1 v31.8H, v22.8H, v25.8H // ..e...............................................................................
- uzp2 v16.8H, v22.8H, v25.8H // ...e..............................................................................
- uzp2 v21.8H, v3.8H, v17.8H // .......e..........................................................................
- smlal v29.4S, v19.4H, v0.4H // ...........................................................................*......
- smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*.....
- uzp1 v19.8H, v3.8H, v17.8H // ......e...........................................................................
- uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*.............
- zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l..
- mul v23.8H, v26.8H, v2.8H // .....................................................................*............
- uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*....
- smull2 v24.4S, v31.8H, v21.8H // ..............e...................................................................
- str q14, [x0, #16] // .................................................................................l
- ldr q3, [x7, #16] // ...................................e..............................................
- ldr q6, [x8, #16] // .......................................e..........................................
- ldr q8, [x10], #32 // ...................................................e..............................
- ldr q26, [x10, #-16] // ....................................................e.............................
- ld1 {v22.8H}, [x12], #16 // ...........................................................e......................
- uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................
- uzp2 v11.8H, v8.8H, v26.8H // ......................................................e...........................
- ldr q8, [x4], #32 // .................e................................................................
- ldr q26, [x4, #-16] // ..................e...............................................................
- ldr q4, [x7], #32 // ..................................e...............................................
- uzp1 v20.8H, v8.8H, v26.8H // ...................e..............................................................
- uzp2 v26.8H, v8.8H, v26.8H // ....................e.............................................................
- ld1 {v8.8H}, [x6], #16 // .........................e........................................................
- uzp1 v9.8H, v4.8H, v3.8H // ....................................e.............................................
- ldr q25, [x11, #16] // ........................................................e.........................
- ldr q29, [x11], #32 // .......................................................e..........................
- ld1 {v12.8H}, [x9], #16 // ..........................................e.......................................
- ldr q14, [x8], #32 // ......................................e...........................................
- smlal2 v24.4S, v16.8H, v19.8H // ................e.................................................................
- smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*..........
- smlal v18.4S, v23.4H, v0.4H // ......................................................................*...........
- ld1 {v23.8H}, [x3], #16 // ........e.........................................................................
- smlal2 v24.4S, v20.8H, v27.8H // ...............................e..................................................
- uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*.........
- uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................
- str q5, [x0], #32 // ................................................................................l.
- zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*...
-
- // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------>
- // 0 25 50 75 100 125 150 175 200 225
- // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------
- // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~.........................................
- // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~.............................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~.....................................
- // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~....................................
- // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~.............................................................................
- // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~..........................................................................
- // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................
- // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~...................................
- // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~....
- // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~.........................................................................
- // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~..............................................................................
- // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~..................................................................
- // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~...........................................................................
- // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~.......................................................................
- // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~...........................
- // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~....................................................................
- // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~.......
- // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~..................
- // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~.................
- // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~...............
- // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~..............
- // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~......................................................................
- // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~...................................................................
- // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~.......................................
- // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~...............................................................
- // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~.............
- // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~..............................................................
- // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................
- // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~...........................................................
- // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~.....................................................................
- // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~.................................................................
- // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~...
- // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................
- // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................
- // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................
- // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~.........................
- // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............
- // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~...............................................................................
- // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........
- // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................
- // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................
- // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~..........................................................
- // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~.........
- // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~......................................................
- // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~.........................................................
- // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~..................................................
- // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~.....................................................
- // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~.......................................................
- // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................
- // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~...................................................
- // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~....................................................
- // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~.......................
- // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~......................
- // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~....................
- // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~...................
- // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~..........
- // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~...........
- // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~.
- // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................
- // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~.....................
- // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~..............................................
- // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~.................................................
- // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~..........................................
- // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~.............................................
- // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~...............................................
- // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................
- // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~...........................................
- // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................
- // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~...............................
- // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~.............................
- // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~.....
- // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~......
- // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~..
- // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................
- // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~......................................
- // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~..................................
- // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~.................................
- // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................
- // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................
- // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l..............................
- // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l
- // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l..........................
-
- sub count, count, #1
- cbnz count, 1b
- // Instructions: 50
- // Expected cycles: 56
- // Expected IPC: 0.89
- //
- // Cycle bound: 56.0
- // IPC bound: 0.89
- //
- // Wall time: 4.16s
- // User time: 4.16s
- //
- // --------------- original position --------------->
- // 0 25
- // |------------------------|
- smull2 v17.4S, v31.8H, v19.8H // ..*...............................................
- uzp2 v1.8H, v14.8H, v6.8H // ................*.................................
- smull v18.4S, v31.4H, v21.4H // .......*..........................................
- smlal2 v24.4S, v26.8H, v28.8H // *.................................................
- smlal2 v17.4S, v16.8H, v23.8H // ....*.............................................
- smull v21.4S, v31.4H, v19.4H // .....*............................................
- smlal v18.4S, v16.4H, v19.4H // .........*........................................
- uzp2 v31.8H, v4.8H, v3.8H // .*................................................
- uzp1 v3.8H, v14.8H, v6.8H // ............*.....................................
- smlal v21.4S, v16.4H, v23.4H // ..........*.......................................
- smlal v18.4S, v20.4H, v27.4H // ...........*......................................
- uzp2 v14.8H, v29.8H, v25.8H // ...*..............................................
- smlal2 v17.4S, v20.8H, v28.8H // ......*...........................................
- smlal v21.4S, v20.4H, v28.4H // .............*....................................
- smlal v18.4S, v26.4H, v28.4H // ..............*...................................
- smlal2 v24.4S, v9.8H, v1.8H // ..................*...............................
- smlal2 v17.4S, v26.8H, v8.8H // ........*.........................................
- smlal v21.4S, v26.4H, v8.4H // ...............*..................................
- smlal v18.4S, v9.4H, v1.4H // ...................*..............................
- smlal2 v24.4S, v31.8H, v3.8H // ......................*...........................
- smlal2 v17.4S, v9.8H, v3.8H // .................*................................
- smlal v21.4S, v9.4H, v3.4H // ....................*.............................
- smlal v18.4S, v31.4H, v3.4H // .......................*..........................
- smlal2 v24.4S, v30.8H, v14.8H // ..........................*.......................
- smlal2 v17.4S, v31.8H, v12.8H // .....................*............................
- smlal v21.4S, v31.4H, v12.4H // ........................*.........................
- smlal v18.4S, v30.4H, v14.4H // ...........................*......................
- smlal2 v24.4S, v11.8H, v10.8H // ..............................*...................
- smlal2 v17.4S, v30.8H, v10.8H // .........................*........................
- smlal v21.4S, v30.4H, v10.4H // ............................*.....................
- smlal v18.4S, v11.4H, v10.4H // ...............................*..................
- zip2 v19.8H, v7.8H, v15.8H // ......................................*...........
- smlal2 v17.4S, v11.8H, v22.8H // .............................*....................
- smlal v21.4S, v11.4H, v22.4H // ................................*.................
- uzp1 v23.8H, v18.8H, v24.8H // .................................*................
- str q19, [x0, #16] // .........................................*........
- mul v19.8H, v23.8H, v2.8H // ..................................*...............
- uzp1 v23.8H, v21.8H, v17.8H // .....................................*............
- str q5, [x0], #32 // .............................................*....
- mul v26.8H, v23.8H, v2.8H // .......................................*..........
- smlal v18.4S, v19.4H, v0.4H // ...................................*..............
- smlal2 v24.4S, v19.8H, v0.8H // ....................................*.............
- smlal v21.4S, v26.4H, v0.4H // ...........................................*......
- smlal2 v17.4S, v26.8H, v0.8H // ..........................................*.......
- uzp2 v13.8H, v18.8H, v24.8H // ........................................*.........
- uzp2 v19.8H, v21.8H, v17.8H // ............................................*.....
- zip1 v23.8H, v19.8H, v13.8H // ..............................................*...
- zip2 v19.8H, v19.8H, v13.8H // ...............................................*..
- str q23, [x0], #32 // .................................................*
- str q19, [x0, #-16] // ................................................*.
-
- // ----------------- new position ------------------>
- // 0 25
- // |------------------------|------------------------
- // smlal2 v24.4S, v26.8H, v28.8H // ...*..............................................
- // uzp2 v4.8H, v4.8H, v3.8H // .......*..........................................
- // smull2 v13.4S, v31.8H, v19.8H // *.................................................
- // uzp2 v1.8H, v29.8H, v25.8H // ...........*......................................
- // smlal2 v13.4S, v16.8H, v23.8H // ....*.............................................
- // smull v18.4S, v31.4H, v19.4H // .....*............................................
- // smlal2 v13.4S, v20.8H, v28.8H // ............*.....................................
- // smull v29.4S, v31.4H, v21.4H // ..*...............................................
- // smlal2 v13.4S, v26.8H, v8.8H // ................*.................................
- // smlal v29.4S, v16.4H, v19.4H // ......*...........................................
- // smlal v18.4S, v16.4H, v23.4H // .........*........................................
- // smlal v29.4S, v20.4H, v27.4H // ..........*.......................................
- // uzp1 v31.8H, v14.8H, v6.8H // ........*.........................................
- // smlal v18.4S, v20.4H, v28.4H // .............*....................................
- // smlal v29.4S, v26.4H, v28.4H // ..............*...................................
- // smlal v18.4S, v26.4H, v8.4H // .................*................................
- // uzp2 v26.8H, v14.8H, v6.8H // .*................................................
- // smlal2 v13.4S, v9.8H, v31.8H // ....................*.............................
- // smlal2 v24.4S, v9.8H, v26.8H // ...............*..................................
- // smlal v29.4S, v9.4H, v26.4H // ..................*...............................
- // smlal v18.4S, v9.4H, v31.4H // .....................*............................
- // smlal2 v13.4S, v4.8H, v12.8H // ........................*.........................
- // smlal2 v24.4S, v4.8H, v31.8H // ...................*..............................
- // smlal v29.4S, v4.4H, v31.4H // ......................*...........................
- // smlal v18.4S, v4.4H, v12.4H // .........................*........................
- // smlal2 v13.4S, v30.8H, v10.8H // ............................*.....................
- // smlal2 v24.4S, v30.8H, v1.8H // .......................*..........................
- // smlal v29.4S, v30.4H, v1.4H // ..........................*.......................
- // smlal v18.4S, v30.4H, v10.4H // .............................*....................
- // smlal2 v13.4S, v11.8H, v22.8H // ................................*.................
- // smlal2 v24.4S, v11.8H, v10.8H // ...........................*......................
- // smlal v29.4S, v11.4H, v10.4H // ..............................*...................
- // smlal v18.4S, v11.4H, v22.4H // .................................*................
- // uzp1 v31.8H, v29.8H, v24.8H // ..................................*...............
- // mul v19.8H, v31.8H, v2.8H // ....................................*.............
- // smlal v29.4S, v19.4H, v0.4H // ........................................*.........
- // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........
- // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............
- // zip2 v14.8H, v7.8H, v15.8H // ...............................*..................
- // mul v23.8H, v26.8H, v2.8H // .......................................*..........
- // uzp2 v15.8H, v29.8H, v24.8H // ............................................*.....
- // str q14, [x0, #16] // ...................................*..............
- // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*......
- // smlal v18.4S, v23.4H, v0.4H // ..........................................*.......
- // uzp2 v7.8H, v18.8H, v13.8H // .............................................*....
- // str q5, [x0], #32 // ......................................*...........
- // zip1 v5.8H, v7.8H, v15.8H // ..............................................*...
- // zip2 v14.8H, v7.8H, v15.8H // ...............................................*..
- // str q14, [x0, #16] // .................................................*
- // str q5, [x0], #32 // ................................................*.
-
-
- pop_stack
- ret
-#endif /* MLKEM_K == 4 */
-
-/****************** REGISTER DEALLOCATIONS *******************/
- .unreq out
- .unreq a0_ptr
- .unreq b0_ptr
- .unreq b0_cache_ptr
- .unreq a1_ptr
- .unreq b1_ptr
- .unreq b1_cache_ptr
- .unreq a2_ptr
- .unreq b2_ptr
- .unreq b2_cache_ptr
- .unreq a3_ptr
- .unreq b3_ptr
- .unreq b3_cache_ptr
- .unreq count
- .unreq modulus
- .unreq modulus_twisted
- .unreq wtmp
- .unreq aa0
- .unreq aa1
- .unreq bb0
- .unreq bb1
- .unreq bb1t
- .unreq res0l
- .unreq res1l
- .unreq res0h
- .unreq res1h
- .unreq tmp0
- .unreq tmp1
- .unreq q_tmp0
- .unreq q_tmp1
- .unreq out0
- .unreq out1
- .unreq t0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
index 8302d2a3e..f2451815a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
@@ -19,8 +19,8 @@
* Returns number of sampled 16-bit integers (at most MLKEM_N).
**************************************************/
#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
// We save the output on the stack first, and copy to the actual
// output buffer only in the end. This is because the main loop can overwrite
@@ -112,9 +112,9 @@
mlkem_q .req v30
bits .req v31
-.text
-.global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean)
-.balign 4
+ .text
+ .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean)
+ .balign 4
MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean):
push_stack
@@ -402,5 +402,5 @@ return:
.unreq mlkem_q
.unreq bits
-#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) ||
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */
+/* simpasm: footer-start */
+#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c
index becdf303b..592c15fb0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c
@@ -10,8 +10,7 @@
#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
#include <stdint.h>
#include "arith_native_aarch64.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h
index f9fe4310a..df43dc5b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h
@@ -11,21 +11,10 @@
#include "../sys.h"
#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
#include "aarch64/opt.h"
#endif /* SYS_AARCH64 */
#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
#include "x86_64/default.h"
#endif /* SYS_X86_64 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h
index f9fe4310a..df43dc5b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h
@@ -11,21 +11,10 @@
#include "../sys.h"
#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
#include "aarch64/opt.h"
#endif /* SYS_AARCH64 */
#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
#include "x86_64/default.h"
#endif /* SYS_X86_64 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h
index f9fe4310a..df43dc5b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h
@@ -11,21 +11,10 @@
#include "../sys.h"
#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
#include "aarch64/opt.h"
#endif /* SYS_AARCH64 */
#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
#include "x86_64/default.h"
#endif /* SYS_X86_64 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S
index 5fdc3d0a0..3063d20ae 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S
@@ -8,6 +8,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
#include "consts.h"
@@ -113,6 +114,7 @@ vmovdqa %ymm11,(64*\off+48)*2(%rdi)
.text
.global MLKEM_ASM_NAMESPACE(basemul_avx2)
+.balign 4
MLKEM_ASM_NAMESPACE(basemul_avx2):
mov %rsp,%r8
and $-32,%rsp
@@ -133,4 +135,5 @@ schoolbook 3
mov %r8,%rsp
ret
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S
index 7b1f22624..e74199930 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S
@@ -12,6 +12,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
#include "consts.h"
#include "shuffle.inc"
@@ -242,6 +243,7 @@ vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.text
.global MLKEM_ASM_NAMESPACE(invntt_avx2)
+.balign 4
MLKEM_ASM_NAMESPACE(invntt_avx2):
vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
@@ -252,4 +254,5 @@ intt_level6 0
intt_level6 1
ret
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S
index 5d928b4cc..70582fbc1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S
@@ -8,6 +8,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
#include "consts.h"
#include "shuffle.inc"
@@ -205,6 +206,7 @@ vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.text
.global MLKEM_ASM_NAMESPACE(ntt_avx2)
+.balign 4
MLKEM_ASM_NAMESPACE(ntt_avx2):
vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
@@ -216,4 +218,5 @@ levels1t6 1
ret
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttfrombytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttfrombytes.S
new file mode 100644
index 000000000..71f2af000
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttfrombytes.S
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(nttfrombytes_avx2): // entry point: runs the helper below twice, on two consecutive halves
+#consts
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0 // ymm0 = 32-byte constant from the table at %rdx; the helper ANDs results with it
+call nttfrombytes128_avx // first half: helper reads 192 bytes at %rsi and writes 256 bytes at %rdi
+add $256,%rdi // step output past the 8 x 32 bytes just stored
+add $192,%rsi // step input past the 6 x 32 bytes just loaded
+call nttfrombytes128_avx // second half
+ret
+
+nttfrombytes128_avx: // local helper: 192 input bytes at %rsi -> 256 output bytes at %rdi; expects the mask in %ymm0
+#load
+vmovdqu (%rsi),%ymm4 // six 32-byte loads; vmovdqu places no alignment requirement on %rsi
+vmovdqu 32(%rsi),%ymm5
+vmovdqu 64(%rsi),%ymm6
+vmovdqu 96(%rsi),%ymm7
+vmovdqu 128(%rsi),%ymm8
+vmovdqu 160(%rsi),%ymm9
+
+shuffle8 4,7,3,7 // shuffleN macros are defined in shuffle.inc (not visible here); args presumably in0,in1,out0,out1 - confirm
+shuffle8 5,8,4,8
+shuffle8 6,9,5,9
+
+shuffle4 3,8,6,8
+shuffle4 7,5,3,5
+shuffle4 4,9,7,9
+
+shuffle2 6,5,4,5
+shuffle2 8,7,6,7
+shuffle2 3,9,8,9
+
+shuffle1 4,7,10,7
+shuffle1 5,8,4,8
+shuffle1 6,9,5,9
+
+#bitunpack
+vpsrlw $12,%ymm10,%ymm11 // all shifts below act on each 16-bit word independently
+vpsllw $4,%ymm7,%ymm12
+vpor %ymm11,%ymm12,%ymm11 // ymm11 = (ymm10 >> 12) | (ymm7 << 4)
+vpand %ymm0,%ymm10,%ymm10 // ymm10 &= mask
+vpand %ymm0,%ymm11,%ymm11 // ymm11 &= mask
+
+vpsrlw $8,%ymm7,%ymm12
+vpsllw $8,%ymm4,%ymm13
+vpor %ymm12,%ymm13,%ymm12 // ymm12 = (ymm7 >> 8) | (ymm4 << 8)
+vpand %ymm0,%ymm12,%ymm12
+
+vpsrlw $4,%ymm4,%ymm13 // ymm13 = ymm4 >> 4
+vpand %ymm0,%ymm13,%ymm13
+
+vpsrlw $12,%ymm8,%ymm14 // same unpacking pattern for the second register triple (ymm8, ymm5, ymm9)
+vpsllw $4,%ymm5,%ymm15
+vpor %ymm14,%ymm15,%ymm14 // ymm14 = (ymm8 >> 12) | (ymm5 << 4)
+vpand %ymm0,%ymm8,%ymm8
+vpand %ymm0,%ymm14,%ymm14
+
+vpsrlw $8,%ymm5,%ymm15
+vpsllw $8,%ymm9,%ymm1
+vpor %ymm15,%ymm1,%ymm15 // ymm15 = (ymm5 >> 8) | (ymm9 << 8)
+vpand %ymm0,%ymm15,%ymm15
+
+vpsrlw $4,%ymm9,%ymm1 // ymm1 = ymm9 >> 4
+vpand %ymm0,%ymm1,%ymm1
+
+#store
+vmovdqa %ymm10,(%rdi) // eight 32-byte stores; vmovdqa requires %rdi to be 32-byte aligned
+vmovdqa %ymm11,32(%rdi)
+vmovdqa %ymm12,64(%rdi)
+vmovdqa %ymm13,96(%rdi)
+vmovdqa %ymm8,128(%rdi)
+vmovdqa %ymm14,160(%rdi)
+vmovdqa %ymm15,192(%rdi)
+vmovdqa %ymm1,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttpack.S
new file mode 100644
index 000000000..4c10ef366
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttpack.S
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(nttpack_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(nttpack_avx2): // permutes one 256-byte block at %rdi in place
+#load
+vmovdqa (%rdi),%ymm4 // eight 32-byte loads; vmovdqa requires %rdi to be 32-byte aligned
+vmovdqa 32(%rdi),%ymm5
+vmovdqa 64(%rdi),%ymm6
+vmovdqa 96(%rdi),%ymm7
+vmovdqa 128(%rdi),%ymm8
+vmovdqa 160(%rdi),%ymm9
+vmovdqa 192(%rdi),%ymm10
+vmovdqa 224(%rdi),%ymm11
+
+shuffle1 4,5,3,5 // shuffleN macros are defined in shuffle.inc (not visible here); args presumably in0,in1,out0,out1 - confirm
+shuffle1 6,7,4,7
+shuffle1 8,9,6,9
+shuffle1 10,11,8,11
+
+shuffle2 3,4,10,4
+shuffle2 6,8,3,8
+shuffle2 5,7,6,7
+shuffle2 9,11,5,11
+
+shuffle4 10,3,9,3
+shuffle4 6,5,10,5
+shuffle4 4,8,6,8
+shuffle4 7,11,4,11
+
+shuffle8 9,10,7,10
+shuffle8 6,4,9,4
+shuffle8 3,5,6,5
+shuffle8 8,11,3,11
+
+#store
+vmovdqa %ymm7,(%rdi) // results go back to the same 256 bytes that were loaded
+vmovdqa %ymm9,32(%rdi)
+vmovdqa %ymm6,64(%rdi)
+vmovdqa %ymm3,96(%rdi)
+vmovdqa %ymm10,128(%rdi)
+vmovdqa %ymm4,160(%rdi)
+vmovdqa %ymm5,192(%rdi)
+vmovdqa %ymm11,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntttobytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntttobytes.S
new file mode 100644
index 000000000..4f0b01e83
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntttobytes.S
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(ntttobytes_avx2): // entry point: runs the helper below twice, on two consecutive halves
+#consts
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0 // NOTE(review): no instruction visible in this file reads %ymm0 afterwards - confirm this load is still needed
+call ntttobytes128_avx // first half: helper reads 256 bytes at %rsi and writes 192 bytes at %rdi
+add $256,%rsi // step input past the 8 x 32 bytes just loaded
+add $192,%rdi // step output past the 6 x 32 bytes just stored
+call ntttobytes128_avx // second half
+ret
+
+ntttobytes128_avx: // local helper: 256 input bytes at %rsi -> 192 output bytes at %rdi
+#load
+vmovdqa (%rsi),%ymm5 // eight 32-byte loads; vmovdqa requires %rsi to be 32-byte aligned
+vmovdqa 32(%rsi),%ymm6
+vmovdqa 64(%rsi),%ymm7
+vmovdqa 96(%rsi),%ymm8
+vmovdqa 128(%rsi),%ymm9
+vmovdqa 160(%rsi),%ymm10
+vmovdqa 192(%rsi),%ymm11
+vmovdqa 224(%rsi),%ymm12
+
+#bitpack
+vpsllw $12,%ymm6,%ymm4 // all shifts below act on each 16-bit word independently
+vpor %ymm4,%ymm5,%ymm4 // ymm4 = ymm5 | (ymm6 << 12)
+
+vpsrlw $4,%ymm6,%ymm5
+vpsllw $8,%ymm7,%ymm6
+vpor %ymm5,%ymm6,%ymm5 // ymm5 = (old ymm6 >> 4) | (ymm7 << 8)
+
+vpsrlw $8,%ymm7,%ymm6
+vpsllw $4,%ymm8,%ymm7
+vpor %ymm6,%ymm7,%ymm6 // ymm6 = (old ymm7 >> 8) | (ymm8 << 4); four input vectors now packed into three
+
+vpsllw $12,%ymm10,%ymm7 // same packing pattern for the second group (ymm9..ymm12 -> ymm7..ymm9)
+vpor %ymm7,%ymm9,%ymm7
+
+vpsrlw $4,%ymm10,%ymm8
+vpsllw $8,%ymm11,%ymm9
+vpor %ymm8,%ymm9,%ymm8
+
+vpsrlw $8,%ymm11,%ymm9
+vpsllw $4,%ymm12,%ymm10
+vpor %ymm9,%ymm10,%ymm9
+
+shuffle1 4,5,3,5 // shuffleN macros are defined in shuffle.inc (not visible here); args presumably in0,in1,out0,out1 - confirm
+shuffle1 6,7,4,7
+shuffle1 8,9,6,9
+
+shuffle2 3,4,8,4
+shuffle2 6,5,3,5
+shuffle2 7,9,6,9
+
+shuffle4 8,3,7,3
+shuffle4 6,4,8,4
+shuffle4 5,9,6,9
+
+shuffle8 7,8,5,8
+shuffle8 6,3,7,3
+shuffle8 4,9,6,9
+
+#store
+vmovdqu %ymm5,(%rdi) // six 32-byte stores; vmovdqu places no alignment requirement on %rdi
+vmovdqu %ymm7,32(%rdi)
+vmovdqu %ymm6,64(%rdi)
+vmovdqu %ymm8,96(%rdi)
+vmovdqu %ymm3,128(%rdi)
+vmovdqu %ymm9,160(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttunpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttunpack.S
new file mode 100644
index 000000000..0cf45c671
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttunpack.S
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(nttunpack_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(nttunpack_avx2): // entry point: permutes two consecutive 256-byte blocks at %rdi in place
+call nttunpack128_avx2 // first 256-byte block
+add $256,%rdi // advance to the second block
+call nttunpack128_avx2 // second 256-byte block
+ret
+
+nttunpack128_avx2: // local helper: permutes one 256-byte block at %rdi in place
+#load
+vmovdqa (%rdi),%ymm4 // eight 32-byte loads; vmovdqa requires %rdi to be 32-byte aligned
+vmovdqa 32(%rdi),%ymm5
+vmovdqa 64(%rdi),%ymm6
+vmovdqa 96(%rdi),%ymm7
+vmovdqa 128(%rdi),%ymm8
+vmovdqa 160(%rdi),%ymm9
+vmovdqa 192(%rdi),%ymm10
+vmovdqa 224(%rdi),%ymm11
+
+shuffle8 4,8,3,8 // shuffleN macros are defined in shuffle.inc (not visible here); args presumably in0,in1,out0,out1 - confirm
+shuffle8 5,9,4,9
+shuffle8 6,10,5,10
+shuffle8 7,11,6,11
+
+shuffle4 3,5,7,5
+shuffle4 8,10,3,10
+shuffle4 4,6,8,6
+shuffle4 9,11,4,11
+
+shuffle2 7,8,9,8
+shuffle2 5,6,7,6
+shuffle2 3,4,5,4
+shuffle2 10,11,3,11
+
+shuffle1 9,5,10,5
+shuffle1 8,4,9,4
+shuffle1 7,3,8,3
+shuffle1 6,11,7,11
+
+#store
+vmovdqa %ymm10,(%rdi) // results go back to the same 256 bytes that were loaded
+vmovdqa %ymm5,32(%rdi)
+vmovdqa %ymm9,64(%rdi)
+vmovdqa %ymm4,96(%rdi)
+vmovdqa %ymm8,128(%rdi)
+vmovdqa %ymm3,160(%rdi)
+vmovdqa %ymm7,192(%rdi)
+vmovdqa %ymm11,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/reduce.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/reduce.S
new file mode 100644
index 000000000..78bad0559
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/reduce.S
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation based on Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+// Changes:
+// - Add csubq invocations in reduce128_avx2 to produce outputs
+// in [0,1,...,q-1] rather than [0,1,...,q], matching the
+// semantics of poly_reduce().
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(reduce_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(reduce_avx2): // entry point: reduces two consecutive 256-byte blocks at %rdi in place
+#consts
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 // ymm0 = 16XQ constant from the table at %rsi
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1 // ymm1 = 16XV constant; both presumably consumed by red16/csubq (fq.inc, not visible here)
+call reduce128_avx2 // first 256-byte block
+add $256,%rdi // advance to the second block
+call reduce128_avx2 // second 256-byte block
+ret
+
+reduce128_avx2: // local helper: reduces one 256-byte block at %rdi in place
+#load
+vmovdqa (%rdi),%ymm2 // eight 32-byte loads; vmovdqa requires %rdi to be 32-byte aligned
+vmovdqa 32(%rdi),%ymm3
+vmovdqa 64(%rdi),%ymm4
+vmovdqa 96(%rdi),%ymm5
+vmovdqa 128(%rdi),%ymm6
+vmovdqa 160(%rdi),%ymm7
+vmovdqa 192(%rdi),%ymm8
+vmovdqa 224(%rdi),%ymm9
+
+red16 2 // red16 is a macro from fq.inc (not visible here), applied to each of ymm2..ymm9
+red16 3
+red16 4
+red16 5
+red16 6
+red16 7
+red16 8
+red16 9
+
+csubq 2 // extra pass noted in the file header: brings outputs into [0,q-1] rather than [0,q]
+csubq 3
+csubq 4
+csubq 5
+csubq 6
+csubq 7
+csubq 8
+csubq 9
+
+#store
+vmovdqa %ymm2,(%rdi) // results go back to the same 256 bytes that were loaded
+vmovdqa %ymm3,32(%rdi)
+vmovdqa %ymm4,64(%rdi)
+vmovdqa %ymm5,96(%rdi)
+vmovdqa %ymm6,128(%rdi)
+vmovdqa %ymm7,160(%rdi)
+vmovdqa %ymm8,192(%rdi)
+vmovdqa %ymm9,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S
deleted file mode 100644
index 9bcd04896..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-// Implementation from Kyber reference repository
-// https://github.com/pq-crystals/kyber/blob/main/avx2
-
-#include "../../../common.h"
-
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
-
-#include "consts.h"
-#include "fq.inc"
-#include "shuffle.inc"
-
-.global MLKEM_ASM_NAMESPACE(nttpack_avx2)
-MLKEM_ASM_NAMESPACE(nttpack_avx2):
-#load
-vmovdqa (%rdi),%ymm4
-vmovdqa 32(%rdi),%ymm5
-vmovdqa 64(%rdi),%ymm6
-vmovdqa 96(%rdi),%ymm7
-vmovdqa 128(%rdi),%ymm8
-vmovdqa 160(%rdi),%ymm9
-vmovdqa 192(%rdi),%ymm10
-vmovdqa 224(%rdi),%ymm11
-
-shuffle1 4,5,3,5
-shuffle1 6,7,4,7
-shuffle1 8,9,6,9
-shuffle1 10,11,8,11
-
-shuffle2 3,4,10,4
-shuffle2 6,8,3,8
-shuffle2 5,7,6,7
-shuffle2 9,11,5,11
-
-shuffle4 10,3,9,3
-shuffle4 6,5,10,5
-shuffle4 4,8,6,8
-shuffle4 7,11,4,11
-
-shuffle8 9,10,7,10
-shuffle8 6,4,9,4
-shuffle8 3,5,6,5
-shuffle8 8,11,3,11
-
-#store
-vmovdqa %ymm7,(%rdi)
-vmovdqa %ymm9,32(%rdi)
-vmovdqa %ymm6,64(%rdi)
-vmovdqa %ymm3,96(%rdi)
-vmovdqa %ymm10,128(%rdi)
-vmovdqa %ymm4,160(%rdi)
-vmovdqa %ymm5,192(%rdi)
-vmovdqa %ymm11,224(%rdi)
-
-ret
-
-nttunpack128_avx2:
-#load
-vmovdqa (%rdi),%ymm4
-vmovdqa 32(%rdi),%ymm5
-vmovdqa 64(%rdi),%ymm6
-vmovdqa 96(%rdi),%ymm7
-vmovdqa 128(%rdi),%ymm8
-vmovdqa 160(%rdi),%ymm9
-vmovdqa 192(%rdi),%ymm10
-vmovdqa 224(%rdi),%ymm11
-
-shuffle8 4,8,3,8
-shuffle8 5,9,4,9
-shuffle8 6,10,5,10
-shuffle8 7,11,6,11
-
-shuffle4 3,5,7,5
-shuffle4 8,10,3,10
-shuffle4 4,6,8,6
-shuffle4 9,11,4,11
-
-shuffle2 7,8,9,8
-shuffle2 5,6,7,6
-shuffle2 3,4,5,4
-shuffle2 10,11,3,11
-
-shuffle1 9,5,10,5
-shuffle1 8,4,9,4
-shuffle1 7,3,8,3
-shuffle1 6,11,7,11
-
-#store
-vmovdqa %ymm10,(%rdi)
-vmovdqa %ymm5,32(%rdi)
-vmovdqa %ymm9,64(%rdi)
-vmovdqa %ymm4,96(%rdi)
-vmovdqa %ymm8,128(%rdi)
-vmovdqa %ymm3,160(%rdi)
-vmovdqa %ymm7,192(%rdi)
-vmovdqa %ymm11,224(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(nttunpack_avx2)
-MLKEM_ASM_NAMESPACE(nttunpack_avx2):
-call nttunpack128_avx2
-add $256,%rdi
-call nttunpack128_avx2
-ret
-
-ntttobytes128_avx:
-#load
-vmovdqa (%rsi),%ymm5
-vmovdqa 32(%rsi),%ymm6
-vmovdqa 64(%rsi),%ymm7
-vmovdqa 96(%rsi),%ymm8
-vmovdqa 128(%rsi),%ymm9
-vmovdqa 160(%rsi),%ymm10
-vmovdqa 192(%rsi),%ymm11
-vmovdqa 224(%rsi),%ymm12
-
-#bitpack
-vpsllw $12,%ymm6,%ymm4
-vpor %ymm4,%ymm5,%ymm4
-
-vpsrlw $4,%ymm6,%ymm5
-vpsllw $8,%ymm7,%ymm6
-vpor %ymm5,%ymm6,%ymm5
-
-vpsrlw $8,%ymm7,%ymm6
-vpsllw $4,%ymm8,%ymm7
-vpor %ymm6,%ymm7,%ymm6
-
-vpsllw $12,%ymm10,%ymm7
-vpor %ymm7,%ymm9,%ymm7
-
-vpsrlw $4,%ymm10,%ymm8
-vpsllw $8,%ymm11,%ymm9
-vpor %ymm8,%ymm9,%ymm8
-
-vpsrlw $8,%ymm11,%ymm9
-vpsllw $4,%ymm12,%ymm10
-vpor %ymm9,%ymm10,%ymm9
-
-shuffle1 4,5,3,5
-shuffle1 6,7,4,7
-shuffle1 8,9,6,9
-
-shuffle2 3,4,8,4
-shuffle2 6,5,3,5
-shuffle2 7,9,6,9
-
-shuffle4 8,3,7,3
-shuffle4 6,4,8,4
-shuffle4 5,9,6,9
-
-shuffle8 7,8,5,8
-shuffle8 6,3,7,3
-shuffle8 4,9,6,9
-
-#store
-vmovdqu %ymm5,(%rdi)
-vmovdqu %ymm7,32(%rdi)
-vmovdqu %ymm6,64(%rdi)
-vmovdqu %ymm8,96(%rdi)
-vmovdqu %ymm3,128(%rdi)
-vmovdqu %ymm9,160(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2)
-MLKEM_ASM_NAMESPACE(ntttobytes_avx2):
-#consts
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0
-call ntttobytes128_avx
-add $256,%rsi
-add $192,%rdi
-call ntttobytes128_avx
-ret
-
-nttfrombytes128_avx:
-#load
-vmovdqu (%rsi),%ymm4
-vmovdqu 32(%rsi),%ymm5
-vmovdqu 64(%rsi),%ymm6
-vmovdqu 96(%rsi),%ymm7
-vmovdqu 128(%rsi),%ymm8
-vmovdqu 160(%rsi),%ymm9
-
-shuffle8 4,7,3,7
-shuffle8 5,8,4,8
-shuffle8 6,9,5,9
-
-shuffle4 3,8,6,8
-shuffle4 7,5,3,5
-shuffle4 4,9,7,9
-
-shuffle2 6,5,4,5
-shuffle2 8,7,6,7
-shuffle2 3,9,8,9
-
-shuffle1 4,7,10,7
-shuffle1 5,8,4,8
-shuffle1 6,9,5,9
-
-#bitunpack
-vpsrlw $12,%ymm10,%ymm11
-vpsllw $4,%ymm7,%ymm12
-vpor %ymm11,%ymm12,%ymm11
-vpand %ymm0,%ymm10,%ymm10
-vpand %ymm0,%ymm11,%ymm11
-
-vpsrlw $8,%ymm7,%ymm12
-vpsllw $8,%ymm4,%ymm13
-vpor %ymm12,%ymm13,%ymm12
-vpand %ymm0,%ymm12,%ymm12
-
-vpsrlw $4,%ymm4,%ymm13
-vpand %ymm0,%ymm13,%ymm13
-
-vpsrlw $12,%ymm8,%ymm14
-vpsllw $4,%ymm5,%ymm15
-vpor %ymm14,%ymm15,%ymm14
-vpand %ymm0,%ymm8,%ymm8
-vpand %ymm0,%ymm14,%ymm14
-
-vpsrlw $8,%ymm5,%ymm15
-vpsllw $8,%ymm9,%ymm1
-vpor %ymm15,%ymm1,%ymm15
-vpand %ymm0,%ymm15,%ymm15
-
-vpsrlw $4,%ymm9,%ymm1
-vpand %ymm0,%ymm1,%ymm1
-
-#store
-vmovdqa %ymm10,(%rdi)
-vmovdqa %ymm11,32(%rdi)
-vmovdqa %ymm12,64(%rdi)
-vmovdqa %ymm13,96(%rdi)
-vmovdqa %ymm8,128(%rdi)
-vmovdqa %ymm14,160(%rdi)
-vmovdqa %ymm15,192(%rdi)
-vmovdqa %ymm1,224(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2)
-MLKEM_ASM_NAMESPACE(nttfrombytes_avx2):
-#consts
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0
-call nttfrombytes128_avx
-add $256,%rdi
-add $192,%rsi
-call nttfrombytes128_avx
-ret
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/tomont.S
similarity index 64%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/tomont.S
index 3f013a5fa..7774cec0b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/tomont.S
@@ -14,63 +14,24 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
-#include "consts.h"
+/* simpasm: header-end */
+#include "consts.h"
#include "fq.inc"
.text
-reduce128_avx2:
-#load
-vmovdqa (%rdi),%ymm2
-vmovdqa 32(%rdi),%ymm3
-vmovdqa 64(%rdi),%ymm4
-vmovdqa 96(%rdi),%ymm5
-vmovdqa 128(%rdi),%ymm6
-vmovdqa 160(%rdi),%ymm7
-vmovdqa 192(%rdi),%ymm8
-vmovdqa 224(%rdi),%ymm9
-
-red16 2
-red16 3
-red16 4
-red16 5
-red16 6
-red16 7
-red16 8
-red16 9
-
-csubq 2
-csubq 3
-csubq 4
-csubq 5
-csubq 6
-csubq 7
-csubq 8
-csubq 9
-
-#store
-vmovdqa %ymm2,(%rdi)
-vmovdqa %ymm3,32(%rdi)
-vmovdqa %ymm4,64(%rdi)
-vmovdqa %ymm5,96(%rdi)
-vmovdqa %ymm6,128(%rdi)
-vmovdqa %ymm7,160(%rdi)
-vmovdqa %ymm8,192(%rdi)
-vmovdqa %ymm9,224(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(reduce_avx2)
-MLKEM_ASM_NAMESPACE(reduce_avx2):
+.global MLKEM_ASM_NAMESPACE(tomont_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(tomont_avx2):
#consts
vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1
-call reduce128_avx2
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2
+call tomont128_avx2
add $256,%rdi
-call reduce128_avx2
+call tomont128_avx2
ret
-
tomont128_avx2:
#load
vmovdqa (%rdi),%ymm3
@@ -103,15 +64,5 @@ vmovdqa %ymm10,224(%rdi)
ret
-.global MLKEM_ASM_NAMESPACE(tomont_avx2)
-MLKEM_ASM_NAMESPACE(tomont_avx2):
-#consts
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2
-call tomont128_avx2
-add $256,%rdi
-call tomont128_avx2
-ret
-
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md
index e499a4a22..a420f05b6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md
@@ -10,10 +10,9 @@ works:
- _Fast and Clean: Auditable high-performance assembly via constraint solving_, Amin Abdulrahman, Hanno Becker, Matthias
J. Kannwischer, Fabien Klein, [https://eprint.iacr.org/2022/1303](https://eprint.iacr.org/2022/1303)
-## Profiles
-This backend comes with two profiles: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to
-read and modify; for example, is heavily leverages register aliases and assembly macros. The optimized profile is
-automatically generated from the clean profile via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the
+## Variants
+
+This backend comes in two versions: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to read and modify; for example, it heavily leverages register aliases and assembly macros. This directory contains the optimized version, which is automatically generated from the clean one via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the
target architecture is Cortex-A55, but you can easily re-optimize the code for a different microarchitecture supported
-by SLOTHY, by adjusting the parameters in [optimize.sh](src/optimize.sh).
+by SLOTHY, by adjusting the parameters in [optimize.sh](../../../test/aarch64_clean/src/optimize.sh).
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h
deleted file mode 100644
index f124702a4..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/* ML-KEM arithmetic native profile for clean assembly */
-
-#ifdef MLKEM_NATIVE_ARITH_PROFILE_H
-#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles?
-#else
-#define MLKEM_NATIVE_ARITH_PROFILE_H
-
-/* Identifier for this backend so that source and assembly files
- * in the build can be appropriately guarded. */
-#define MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN
-
-#define MLKEM_NATIVE_ARITH_BACKEND_NAME AARCH64_CLEAN
-
-/* Filename of the C backend implementation.
- * This is not inlined here because this header is included in assembly
- * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h"
-
-#endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h
index a7217163f..4a0243279 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h
@@ -3,8 +3,6 @@
* SPDX-License-Identifier: Apache-2.0
*/
-/* ML-KEM arithmetic native profile for clean assembly */
-
#ifdef MLKEM_NATIVE_ARITH_PROFILE_H
#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles?
#else
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c
index 2c1bb31e1..23e7949d3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c
@@ -10,8 +10,7 @@
#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
#include
#include "arith_native_aarch64.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h
index ed0825892..60779598d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h
@@ -29,62 +29,49 @@ extern const int16_t aarch64_zetas_mulcache_native[];
extern const int16_t aarch64_zetas_mulcache_twisted_native[];
extern const uint8_t rej_uniform_table[];
-#define ntt_asm_clean MLKEM_NAMESPACE(ntt_asm_clean)
-void ntt_asm_clean(int16_t *, const int16_t *, const int16_t *);
-
#define ntt_asm_opt MLKEM_NAMESPACE(ntt_asm_opt)
void ntt_asm_opt(int16_t *, const int16_t *, const int16_t *);
-#define intt_asm_clean MLKEM_NAMESPACE(intt_asm_clean)
-void intt_asm_clean(int16_t *, const int16_t *, const int16_t *);
-
#define intt_asm_opt MLKEM_NAMESPACE(intt_asm_opt)
void intt_asm_opt(int16_t *, const int16_t *, const int16_t *);
-#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean)
-unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen,
- const uint8_t *table);
-
-#define poly_reduce_asm_clean MLKEM_NAMESPACE(poly_reduce_asm_clean)
-void poly_reduce_asm_clean(int16_t *);
-
#define poly_reduce_asm_opt MLKEM_NAMESPACE(poly_reduce_asm_opt)
void poly_reduce_asm_opt(int16_t *);
-#define poly_tomont_asm_clean MLKEM_NAMESPACE(poly_tomont_asm_clean)
-void poly_tomont_asm_clean(int16_t *);
-
#define poly_tomont_asm_opt MLKEM_NAMESPACE(poly_tomont_asm_opt)
void poly_tomont_asm_opt(int16_t *);
-#define poly_mulcache_compute_asm_clean \
- MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean)
-void poly_mulcache_compute_asm_clean(int16_t *, const int16_t *,
- const int16_t *, const int16_t *);
-
-
#define poly_mulcache_compute_asm_opt \
MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt)
void poly_mulcache_compute_asm_opt(int16_t *, const int16_t *, const int16_t *,
const int16_t *);
-#define poly_tobytes_asm_clean MLKEM_NAMESPACE(poly_tobytes_asm_clean)
-void poly_tobytes_asm_clean(uint8_t *r, const int16_t *a);
-
#define poly_tobytes_asm_opt MLKEM_NAMESPACE(poly_tobytes_asm_opt)
void poly_tobytes_asm_opt(uint8_t *r, const int16_t *a);
-#define polyvec_basemul_acc_montgomery_cached_asm_clean \
- MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-void polyvec_basemul_acc_montgomery_cached_asm_clean(int16_t *r,
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache);
+#define polyvec_basemul_acc_montgomery_cached_asm_k2_opt \
+ MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt)
+void polyvec_basemul_acc_montgomery_cached_asm_k2_opt(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache);
+
+#define polyvec_basemul_acc_montgomery_cached_asm_k3_opt \
+ MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt)
+void polyvec_basemul_acc_montgomery_cached_asm_k3_opt(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache);
+
+#define polyvec_basemul_acc_montgomery_cached_asm_k4_opt \
+ MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt)
+void polyvec_basemul_acc_montgomery_cached_asm_k4_opt(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache);
-#define polyvec_basemul_acc_montgomery_cached_asm_opt \
- MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-void polyvec_basemul_acc_montgomery_cached_asm_opt(int16_t *r, const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache);
+#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean)
+unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen,
+ const uint8_t *table);
#endif /* MLKEM_AARCH64_NATIVE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h
deleted file mode 100644
index 4be90fb24..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/* ML-KEM arithmetic native profile for clean assembly */
-
-#ifdef MLKEM_NATIVE_ARITH_PROFILE_IMPL_H
-#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles?
-#else
-#define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H
-
-#include "arith_native_aarch64.h"
-
-/* Set of primitives that this backend replaces */
-#define MLKEM_USE_NATIVE_NTT
-#define MLKEM_USE_NATIVE_INTT
-#define MLKEM_USE_NATIVE_POLY_REDUCE
-#define MLKEM_USE_NATIVE_POLY_TOMONT
-#define MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE
-#define MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
-#define MLKEM_USE_NATIVE_POLY_TOBYTES
-#define MLKEM_USE_NATIVE_REJ_UNIFORM
-
-static INLINE void ntt_native(int16_t data[MLKEM_N])
-{
- ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56);
-}
-
-static INLINE void intt_native(int16_t data[MLKEM_N])
-{
- intt_asm_clean(data, aarch64_invntt_zetas_layer01234,
- aarch64_invntt_zetas_layer56);
-}
-
-static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
-{
- poly_reduce_asm_clean(data);
-}
-
-static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
-{
- poly_tomont_asm_clean(data);
-}
-
-static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
- const int16_t y[MLKEM_N])
-{
- poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native,
- aarch64_zetas_mulcache_twisted_native);
-}
-
-static INLINE void polyvec_basemul_acc_montgomery_cached_native(
- int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
- const int16_t b[MLKEM_K * MLKEM_N],
- const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
-{
- polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache);
-}
-
-static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
-{
- poly_tobytes_asm_clean(r, a);
-}
-
-static INLINE int rej_uniform_native(int16_t *r, unsigned len,
- const uint8_t *buf, unsigned buflen)
-{
- if (len != MLKEM_N || buflen % 24 != 0)
- {
- return -1;
- }
- return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table);
-}
-
-#endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S
deleted file mode 100644
index b0ae1ad46..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S
+++ /dev/null
@@ -1,389 +0,0 @@
-/// Copyright (c) 2024 The mlkem-native project authors
-/// Copyright (c) 2022 Arm Limited
-/// Copyright (c) 2022 Hanno Becker
-/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
-/// SPDX-License-Identifier: MIT
-///
-/// Permission is hereby granted, free of charge, to any person obtaining a copy
-/// of this software and associated documentation files (the "Software"), to deal
-/// in the Software without restriction, including without limitation the rights
-/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-/// copies of the Software, and to permit persons to whom the Software is
-/// furnished to do so, subject to the following conditions:
-///
-/// The above copyright notice and this permission notice shall be included in all
-/// copies or substantial portions of the Software.
-///
-/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-/// SOFTWARE.
-///
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
-
-// Bounds:
-// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
-//
-// See mlken/reduce.c and test/test_bounds.py for more details.
-.macro mulmodq dst, src, const, idx0, idx1
- // Signed barrett multiplication using
- // round-to-nearest-even-integer approximation.
- // Following https://eprint.iacr.org/2021/986.pdf, this
- // is functionally the same as a signed Montgomery multiplication
- // with a suitable constant of absolute value < q.
- sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()]
- mul \dst\().8h, \src\().8h, \const\().h[\idx0\()]
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro mulmod dst, src, const, const_twisted
- sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h
- mul \dst\().8h, \src\().8h, \const\().8h
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro gs_butterfly a, b, root, idx0, idx1
- sub tmp.8h, \a\().8h, \b\().8h
- add \a\().8h, \a\().8h, \b\().8h
- mulmodq \b, tmp, \root, \idx0, \idx1
-.endm
-
-.macro gs_butterfly_v a, b, root, root_twisted
- sub tmp.8h, \a\().8h, \b\().8h
- add \a\().8h, \a\().8h, \b\().8h
- mulmod \b, tmp, \root, \root_twisted
-.endm
-
-.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
- mulmod \dst0, \src0, ninv, ninv_tw
- mulmod \dst1, \src1, ninv, ninv_tw
- mulmod \dst2, \src2, ninv, ninv_tw
- mulmod \dst3, \src3, ninv, ninv_tw
-.endm
-
-.macro barrett_reduce a
- sqdmulh t0.8h, \a\().8h, consts.h[1]
- srshr t0.8h, t0.8h, #11
- mls \a\().8h, t0.8h, consts.h[0]
-.endm
-
-.macro load_roots_012
- ldr q_root0, [r01234_ptr], #32
- ldr q_root1, [r01234_ptr, #-16]
-.endm
-
-.macro load_next_roots_34
- ldr q_root0, [r01234_ptr], #16
-.endm
-
-.macro load_next_roots_56
- ldr q_root0, [r56_ptr], #(6*16)
- ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)]
- ldr q_root1, [r56_ptr, #(-6*16 + 2*16)]
- ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)]
- ldr q_root2, [r56_ptr, #(-6*16 + 4*16)]
- ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)]
-.endm
-
-.macro transpose4 data
- trn1 t0.4s, \data\()0.4s, \data\()1.4s
- trn2 t1.4s, \data\()0.4s, \data\()1.4s
- trn1 t2.4s, \data\()2.4s, \data\()3.4s
- trn2 t3.4s, \data\()2.4s, \data\()3.4s
-
- trn2 \data\()2.2d, t0.2d, t2.2d
- trn2 \data\()3.2d, t1.2d, t3.2d
- trn1 \data\()0.2d, t0.2d, t2.2d
- trn1 \data\()1.2d, t1.2d, t3.2d
-.endm
-
-.macro transpose_single data_out, data_in
- trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
- trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
- trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
- trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
-// For comparability reasons, the output range for the coefficients of this
-// invNTT code is supposed to match the implementation from PQClean on commit
-// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients
-// are NOT canonically reduced. The ordering of the coefficients is canonical,
-// also matching PQClean.
-
-.text
- .global MLKEM_ASM_NAMESPACE(intt_asm_clean)
-
- in .req x0
- r01234_ptr .req x1
- r56_ptr .req x2
-
- inp .req x3
- count .req x4
- wtmp .req w5
-
- data0 .req v8
- data1 .req v9
- data2 .req v10
- data3 .req v11
- data4 .req v12
- data5 .req v13
- data6 .req v14
- data7 .req v15
-
- q_data0 .req q8
- q_data1 .req q9
- q_data2 .req q10
- q_data3 .req q11
- q_data4 .req q12
- q_data5 .req q13
- q_data6 .req q14
- q_data7 .req q15
-
- root0 .req v0
- root1 .req v1
- root2 .req v2
- root0_tw .req v4
- root1_tw .req v5
- root2_tw .req v6
-
- consts .req v7
- q_consts .req q7
-
- q_root0 .req q0
- q_root1 .req q1
- q_root2 .req q2
- q_root0_tw .req q4
- q_root1_tw .req q5
- q_root2_tw .req q6
-
- tmp .req v24
- t0 .req v25
- t1 .req v26
- t2 .req v27
- t3 .req v28
-
- ninv .req v29
- ninv_tw .req v30
-
-.balign 4
-MLKEM_ASM_NAMESPACE(intt_asm_clean):
- push_stack
-
- // Setup constants
- mov wtmp, #3329
- mov consts.h[0], wtmp
- mov wtmp, #20159
- mov consts.h[1], wtmp
- mov wtmp, #512
- dup ninv.8h, wtmp
- mov wtmp, #5040
- dup ninv_tw.8h, wtmp
-
- mov inp, in
- mov count, #8
-
-scale_start:
-
- ldr q_data0, [inp, #(16*0)]
- ldr q_data1, [inp, #(16*1)]
- ldr q_data2, [inp, #(16*2)]
- ldr q_data3, [inp, #(16*3)]
-
- mul_ninv data0, data1, data2, data3, data0, data1, data2, data3
- // Bounds: Absolute value < q
-
- str q_data0, [inp], #64
- str q_data1, [inp, #(-64 + 16*1)]
- str q_data2, [inp, #(-64 + 16*2)]
- str q_data3, [inp, #(-64 + 16*3)]
-
- subs count, count, #1
- cbnz count, scale_start
-
- mov inp, in
- mov count, #8
-
- .p2align 2
-layer3456_start:
-
- ldr q_data0, [inp, #(16*0)]
- ldr q_data1, [inp, #(16*1)]
- ldr q_data2, [inp, #(16*2)]
- ldr q_data3, [inp, #(16*3)]
-
- transpose4 data // manual ld4
-
- load_next_roots_56
-
- // Layer 7
- gs_butterfly_v data0, data1, root1, root1_tw
- gs_butterfly_v data2, data3, root2, root2_tw
- // Bounds:
- // data0, data2: < 2q
- // data1, data3: < q
-
- // Layer 6
- gs_butterfly_v data0, data2, root0, root0_tw
- gs_butterfly_v data1, data3, root0, root0_tw
- // Bounds:
- // data0: < 4q
- // data1: < 2q
- // data2, data3: < q
-
- transpose4 data
-
- load_next_roots_34
-
- // Layer 5
- gs_butterfly data0, data1, root0, 2, 3
- gs_butterfly data2, data3, root0, 4, 5
- // Max bound: 8q
-
- // Not all of those reductions are needed, but the bounds tracking
- // is easier if we uniformly reduce at this point.
- barrett_reduce data0
- barrett_reduce data2
- barrett_reduce data1
- barrett_reduce data3
-
- // Bounds: q/2
-
- // Layer 4
- gs_butterfly data0, data2, root0, 0, 1
- gs_butterfly data1, data3, root0, 0, 1
- // Bounds: < q
-
- str q_data0, [inp], #(64)
- str q_data1, [inp, #(-64 + 16*1)]
- str q_data2, [inp, #(-64 + 16*2)]
- str q_data3, [inp, #(-64 + 16*3)]
-
- subs count, count, #1
- cbnz count, layer3456_start
-
- // ---------------------------------------------------------------------
-
- mov count, #4
- load_roots_012
-
- .p2align 2
-
-layer012_start:
-
- ldr q_data0, [in, #0]
- ldr q_data1, [in, #(1*(512/8))]
- ldr q_data2, [in, #(2*(512/8))]
- ldr q_data3, [in, #(3*(512/8))]
- ldr q_data4, [in, #(4*(512/8))]
- ldr q_data5, [in, #(5*(512/8))]
- ldr q_data6, [in, #(6*(512/8))]
- ldr q_data7, [in, #(7*(512/8))]
-
- gs_butterfly data0, data1, root0, 6, 7
- gs_butterfly data2, data3, root1, 0, 1
- gs_butterfly data4, data5, root1, 2, 3
- gs_butterfly data6, data7, root1, 4, 5
-
- gs_butterfly data0, data2, root0, 2, 3
- gs_butterfly data1, data3, root0, 2, 3
- gs_butterfly data4, data6, root0, 4, 5
- gs_butterfly data5, data7, root0, 4, 5
-
- gs_butterfly data0, data4, root0, 0, 1
- gs_butterfly data1, data5, root0, 0, 1
- gs_butterfly data2, data6, root0, 0, 1
- gs_butterfly data3, data7, root0, 0, 1
-
- // Bounds: < 8q
-
- str q_data4, [in, #(4*(512/8))]
- str q_data5, [in, #(5*(512/8))]
- str q_data6, [in, #(6*(512/8))]
- str q_data7, [in, #(7*(512/8))]
-
- str q_data0, [in], #(16)
- str q_data1, [in, #(-16 + 1*(512/8))]
- str q_data2, [in, #(-16 + 2*(512/8))]
- str q_data3, [in, #(-16 + 3*(512/8))]
-
- subs count, count, #1
- cbnz count, layer012_start
-
- pop_stack
- ret
-
-/****************** REGISTER DEALLOCATIONS *******************/
- .unreq in
- .unreq r01234_ptr
- .unreq r56_ptr
- .unreq inp
- .unreq count
- .unreq wtmp
- .unreq data0
- .unreq data1
- .unreq data2
- .unreq data3
- .unreq data4
- .unreq data5
- .unreq data6
- .unreq data7
- .unreq q_data0
- .unreq q_data1
- .unreq q_data2
- .unreq q_data3
- .unreq q_data4
- .unreq q_data5
- .unreq q_data6
- .unreq q_data7
- .unreq root0
- .unreq root1
- .unreq root2
- .unreq root0_tw
- .unreq root1_tw
- .unreq root2_tw
- .unreq consts
- .unreq q_consts
- .unreq q_root0
- .unreq q_root1
- .unreq q_root2
- .unreq q_root0_tw
- .unreq q_root1_tw
- .unreq q_root2_tw
- .unreq tmp
- .unreq t0
- .unreq t1
- .unreq t2
- .unreq t3
- .unreq ninv
- .unreq ninv_tw
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S
index 191de3c4d..0f9e44307 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S
@@ -25,6 +25,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
// Bounds:
// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
@@ -139,9 +140,6 @@
// are NOT canonically reduced. The ordering of the coefficients is canonical,
// also matching PQClean.
-.text
- .global MLKEM_ASM_NAMESPACE(intt_asm_opt)
-
in .req x0
r01234_ptr .req x1
r56_ptr .req x2
@@ -194,7 +192,9 @@
ninv .req v29
ninv_tw .req v30
-.balign 4
+ .text
+ .global MLKEM_ASM_NAMESPACE(intt_asm_opt)
+ .balign 4
MLKEM_ASM_NAMESPACE(intt_asm_opt):
push_stack
@@ -1042,4 +1042,5 @@ layer012_start:
.unreq ninv
.unreq ninv_tw
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S
deleted file mode 100644
index 4f844e212..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S
+++ /dev/null
@@ -1,317 +0,0 @@
-///
-/// Copyright (c) 2022 Arm Limited
-/// Copyright (c) 2022 Hanno Becker
-/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
-/// Copyright (c) 2024 The mlkem-native project authors
-// SPDX-License-Identifier: MIT
-///
-/// Permission is hereby granted, free of charge, to any person obtaining a copy
-/// of this software and associated documentation files (the "Software"), to deal
-/// in the Software without restriction, including without limitation the rights
-/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-/// copies of the Software, and to permit persons to whom the Software is
-/// furnished to do so, subject to the following conditions:
-///
-/// The above copyright notice and this permission notice shall be included in all
-/// copies or substantial portions of the Software.
-///
-/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-/// SOFTWARE.
-///
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
-
-// Bounds:
-// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
-//
-// See mlken/reduce.c and test/test_bounds.py for more details.
-.macro mulmodq dst, src, const, idx0, idx1
- // Signed barrett multiplication using
- // round-to-nearest-even-integer approximation.
- // Following https://eprint.iacr.org/2021/986.pdf, this
- // is functionally the same as a signed Montgomery multiplication
- // with a suitable constant of absolute value < q.
- sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()]
- mul \dst\().8h, \src\().8h, \const\().h[\idx0\()]
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro mulmod dst, src, const, const_twisted
- sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h
- mul \dst\().8h, \src\().8h, \const\().8h
- mls \dst\().8h, t2.8h, consts.h[0]
-.endm
-
-.macro ct_butterfly a, b, root, idx0, idx1
- mulmodq tmp, \b, \root, \idx0, \idx1
- sub \b\().8h, \a\().8h, tmp.8h
- add \a\().8h, \a\().8h, tmp.8h
-.endm
-
-.macro ct_butterfly_v a, b, root, root_twisted
- mulmod tmp, \b, \root, \root_twisted
- sub \b\().8h, \a\().8h, tmp.8h
- add \a\().8h, \a\().8h, tmp.8h
-.endm
-
-.macro load_roots_012
- ldr q_root0, [r01234_ptr], #32
- ldr q_root1, [r01234_ptr, #-16]
-.endm
-
-.macro load_next_roots_34
- ldr q_root0, [r01234_ptr], #16
-.endm
-
-.macro load_next_roots_56
- ldr q_root0, [r56_ptr], #(6*16)
- ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)]
- ldr q_root1, [r56_ptr, #(-6*16 + 2*16)]
- ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)]
- ldr q_root2, [r56_ptr, #(-6*16 + 4*16)]
- ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)]
-.endm
-
-.macro transpose4 data
- trn1 t0.4s, \data\()0.4s, \data\()1.4s
- trn2 t1.4s, \data\()0.4s, \data\()1.4s
- trn1 t2.4s, \data\()2.4s, \data\()3.4s
- trn2 t3.4s, \data\()2.4s, \data\()3.4s
-
- trn2 \data\()2.2d, t0.2d, t2.2d
- trn2 \data\()3.2d, t1.2d, t3.2d
- trn1 \data\()0.2d, t0.2d, t2.2d
- trn1 \data\()1.2d, t1.2d, t3.2d
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
- // Arguments
- in .req x0 // Input/output buffer
- r01234_ptr .req x1 // twiddles for layer 0,1,2,3,4
- r56_ptr .req x2 // twiddles for layer 5,6
-
- inp .req x3
- count .req x4
- wtmp .req w5
-
- data0 .req v8
- data1 .req v9
- data2 .req v10
- data3 .req v11
- data4 .req v12
- data5 .req v13
- data6 .req v14
- data7 .req v15
-
- q_data0 .req q8
- q_data1 .req q9
- q_data2 .req q10
- q_data3 .req q11
- q_data4 .req q12
- q_data5 .req q13
- q_data6 .req q14
- q_data7 .req q15
-
- root0 .req v0
- root1 .req v1
- root2 .req v2
- root0_tw .req v4
- root1_tw .req v5
- root2_tw .req v6
-
- q_root0 .req q0
- q_root1 .req q1
- q_root2 .req q2
- q_root0_tw .req q4
- q_root1_tw .req q5
- q_root2_tw .req q6
-
- consts .req v7
-
- tmp .req v24
- t0 .req v25
- t1 .req v26
- t2 .req v27
- t3 .req v28
-
- .text
- .global MLKEM_ASM_NAMESPACE(ntt_asm_clean)
-
- .balign 4
-MLKEM_ASM_NAMESPACE(ntt_asm_clean):
- push_stack
-
- mov wtmp, #3329
- mov consts.h[0], wtmp
- mov wtmp, #20159
- mov consts.h[1], wtmp
-
- mov inp, in
- mov count, #4
-
- load_roots_012
-
- .p2align 2
-
- // Bounds reasoning:
- // - There are 7 layers
- // - When passing from layer N to layer N+1, each layer-N value
- // is modified through the addition/subtraction of a Montgomery
- // product of a twiddle of absolute value < q/2 and a layer-N value.
- // - Recalling that for C such that |a| < C * q and |t|> 0);
- xtn out0.8b, data0.8h
-
- // r[3 * i + 1] = (t0 >> 8);
- shrn out1.8b, data0.8h, #8
- xtn tmp.8b, data1.8h
- // r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
- sli out1.8b, tmp.8b, #4
-
- // r[3 * i + 2] = (t1 >> 4);
- shrn out2.8b, data1.8h, #4
-
- st3 {out0.8b, out1.8b, out2.8b}, [dst], #24
-
- subs count, count, #1
- cbnz count, poly_tobytes_asm_clean_asm_loop_start
- ret
-
- .unreq data0
- .unreq data1
- .unreq out0
- .unreq out1
- .unreq out2
- .unreq tmp
- .unreq dst
- .unreq src
- .unreq count
-
-/**********************************
- * poly_tomont() *
- **********************************/
-.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean)
-
- src .req x0
- count .req x1
- wtmp .req w2
-
- data .req v0
- q_data .req q0
- res .req v1
- q_res .req q1
-
- factor .req v2
- factor_t .req v3
- modulus .req v4
- modulus_twisted .req v5
-
- tmp0 .req v6
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean):
-
- mov wtmp, #3329 // ML-KEM modulus
- dup modulus.8h, wtmp
-
- mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
- dup modulus_twisted.8h, wtmp
-
- mov wtmp, #-1044 // 2^16 % 3329
- dup factor.8h, wtmp
-
- mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
- dup factor_t.8h, wtmp
-
- mov count, #8
-poly_tomont_asm_loop:
-
- ldr q_data, [src], #64
- mulmod res, data, factor, factor_t
- str q_res, [src, #-64]
-
- ldr q_data, [src, #-48]
- mulmod res, data, factor, factor_t
- str q_res, [src, #-48]
-
- ldr q_data, [src, #-32]
- mulmod res, data, factor, factor_t
- str q_res, [src, #-32]
-
- ldr q_data, [src, #-16]
- mulmod res, data, factor, factor_t
- str q_res, [src, #-16]
-
- sub count, count, #1
- cbnz count, poly_tomont_asm_loop
-
- ret
-
- .unreq src
- .unreq count
- .unreq wtmp
-
- .unreq data
- .unreq q_data
- .unreq res
- .unreq q_res
-
- .unreq factor
- .unreq factor_t
- .unreq modulus
- .unreq modulus_twisted
-
- .unreq tmp0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S
new file mode 100644
index 000000000..a3593b7fd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/* Montgomery multiplication, with precomputed Montgomery twist
+ * Expects modulus in modulus.h[0]; uses tmp0 as scratch. */
+.macro mulmod dst, src, const, const_twisted
+ sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h // tmp0 = rounded doubling high half of src * const_twisted
+ mul \dst\().8h, \src\().8h, \const\().8h // dst = low 16 bits of src * const
+ mls \dst\().8h, tmp0.8h, modulus.h[0] // dst -= tmp0 * modulus
+.endm
+
+/********************************************
+ * poly_mulcache_compute() *
+ ********************************************/
+/* x0 = cache_ptr (output), x1 = data_ptr (input), x2 = zeta_ptr, x3 = zeta_twisted_ptr.
+ * Runs 16 iterations; each reads 32 bytes from x1 and stores 16 bytes to x0. */
+ cache_ptr .req x0 // output pointer, post-incremented by 16 on every store
+ data_ptr .req x1 // input pointer, post-incremented by 32 per iteration
+ zeta_ptr .req x2 // multiplier table, post-incremented by 16 per iteration
+ zeta_twisted_ptr .req x3 // twisted multiplier table, post-incremented by 16 per iteration
+ count .req x4 // loop counter
+ wtmp .req w5 // scratch for materializing constants
+
+ data_odd .req v0 // this and the aliases below down to q_dst are not referenced by name in the scheduled code
+ zeta .req v1
+ q_zeta .req q1
+ zeta_twisted .req v2
+ q_zeta_twisted .req q2
+
+ tmp0 .req v3
+ q_tmp0 .req q3
+ tmp1 .req v4
+ q_tmp1 .req q4
+ dst .req v5
+ q_dst .req q5
+
+ modulus .req v6
+ modulus_twisted .req v7
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
+ mov wtmp, #3329 // ML-KEM modulus
+ dup modulus.8h, wtmp
+
+ mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+ dup modulus_twisted.8h, wtmp // never read in this routine: v7 is reloaded from [x3] in the loop before any use
+
+ mov count, #16 // 16 iterations in total: 1 split across prologue/epilogue + 15 loop passes
+ // Instructions: 7
+ // Expected cycles: 12
+ // Expected IPC: 0.58
+
+ // Cycle bound: 12.0
+ // IPC bound: 0.58
+
+ // Wall time: 0.01s
+ // User time: 0.01s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q1, [x1, #16] // *.............................
+ ldr q27, [x1], #32 // ..*...........................
+ ldr q23, [x2], #16 // ....*.........................
+ uzp2 v27.8H, v27.8H, v1.8H // ......*.......................
+ ldr q1, [x3], #16 // .......*......................
+ mul v2.8H, v27.8H, v23.8H // .........*....................
+ sqrdmulh v27.8H, v27.8H, v1.8H // ...........*..................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q29, [x1, #16] // *..............................
+ // ldr q21, [x2], #16 // ....*..........................
+ // ldr q27, [x1], #32 // ..*............................
+ // ldr q7, [x3], #16 // .......*.......................
+ // uzp2 v28.8H, v27.8H, v29.8H // ......*........................
+ // mul v2.8H, v28.8H, v21.8H // .........*.....................
+ // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*...................
+
+ sub count, count, #1 // the first iteration was already started by the prologue above
+poly_mulcache_compute_asm_opt_loop:
+ // Instructions: 9
+ // Expected cycles: 13
+ // Expected IPC: 0.69
+
+ // Cycle bound: 13.0
+ // IPC bound: 0.69
+
+ // Wall time: 0.09s
+ // User time: 0.09s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q29, [x1, #16] // e.............................
+ ldr q21, [x2], #16 // ..e...........................
+ mls v2.8H, v27.8H, v6.H[0] // ....*.........................
+ ldr q27, [x1], #32 // .....e........................
+ ldr q7, [x3], #16 // .......e......................
+ uzp2 v28.8H, v27.8H, v29.8H // .........e....................
+ str q2, [x0], #16 // ..........*...................
+ mul v2.8H, v28.8H, v21.8H // ...........e..................
+ sqrdmulh v27.8H, v28.8H, v7.8H // ............e.................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q3, [x1], #32 // .....e.......'....~.......'....
+ // ldr q4, [x1, #-16] // e............~............~....
+ // ldr q1, [x2], #16 // ..e..........'.~..........'.~..
+ // ldr q2, [x3], #16 // .......e.....'......~.....'....
+ // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'....
+ // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'....
+ // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'....
+ // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'....
+ // str q5, [x0], #16 // ..........~..'.........*..'....
+
+ sub count, count, 1
+ cbnz count, poly_mulcache_compute_asm_opt_loop
+ // Instructions: 2
+ // Expected cycles: 5
+ // Expected IPC: 0.40
+
+ // Cycle bound: 5.0
+ // IPC bound: 0.40
+
+ // Wall time: 0.00s
+ // User time: 0.00s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ mls v2.8H, v27.8H, v6.H[0] // *.............................
+ str q2, [x0], #16 // ....*.........................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // mls v2.8H, v27.8H, v6.H[0] // *..............................
+ // str q2, [x0], #16 // ....*..........................
+
+
+ ret
+
+ .unreq cache_ptr
+ .unreq data_ptr
+ .unreq zeta_ptr
+ .unreq zeta_twisted_ptr
+ .unreq count
+ .unreq wtmp
+
+ .unreq data_odd
+ .unreq zeta
+ .unreq q_zeta
+ .unreq zeta_twisted
+ .unreq q_zeta_twisted
+
+ .unreq tmp0
+ .unreq q_tmp0
+ .unreq tmp1
+ .unreq q_tmp1
+ .unreq dst
+ .unreq q_dst
+
+ .unreq modulus
+ .unreq modulus_twisted
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S
deleted file mode 100644
index 79605818f..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S
+++ /dev/null
@@ -1,670 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
-
-/*
- * Some modular arithmetic macros
- */
-
-/* Barrett reduction */
-.macro barrett_reduce a
- sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0]
- srshr tmp.8h, tmp.8h, #11
- mls \a\().8h, tmp.8h, modulus.h[0]
-.endm
-
-/* Montgomery multiplication, with precomputed Montgomery twist
- * Expects modulus in consts.h[0]. */
-.macro mulmod dst, src, const, const_twisted
- sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h
- mul \dst\().8h, \src\().8h, \const\().8h
- mls \dst\().8h, tmp0.8h, modulus.h[0]
-.endm
-
-/* Turns signed-canonical to unsigned canonical representative
- * through conditional addition of the modulus.
- *
- * Expected modulus in `modulus`. */
-.macro scalar_signed_to_unsigned a
- sshr mask.8h, \a\().8h, #15
- and mask.16b, modulus.16b, mask.16b
- add \a\().8h, \a\().8h, mask.8h
-.endm
-
-/**********************************
- * poly_reduce() *
- **********************************/
-
-.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt)
-
- ptr .req x0
- count .req x1
- wtmp .req w2
-
- data .req v0
- q_data .req q0
-
- tmp .req v1
- mask .req v2
- modulus .req v3
- modulus_twisted .req v4
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
-
- mov wtmp, #3329 // ML-KEM modulus
- dup modulus.8h, wtmp
-
- mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
- dup modulus_twisted.8h, wtmp
-
- mov count, #8
- // Instructions: 15
- // Expected cycles: 22
- // Expected IPC: 0.68
-
- // Cycle bound: 22.0
- // IPC bound: 0.68
-
- // Wall time: 0.05s
- // User time: 0.05s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q21, [x0, #32] // *.............................
- ldr q23, [x0, #48] // ..*...........................
- sqdmulh v7.8H, v21.8H, v4.H[0] // ....*.........................
- sqdmulh v30.8H, v23.8H, v4.H[0] // ......*.......................
- srshr v7.8H, v7.8H, #11 // ........*.....................
- srshr v30.8H, v30.8H, #11 // ..........*...................
- mls v21.8H, v7.8H, v3.H[0] // ...........*..................
- mls v23.8H, v30.8H, v3.H[0] // .............*................
- ldr q5, [x0, #16] // ..............*...............
- sshr v7.8H, v21.8H, #15 // ................*.............
- sshr v30.8H, v23.8H, #15 // .................*............
- and v7.16B, v3.16B, v7.16B // ..................*...........
- add v21.8H, v21.8H, v7.8H // ...................*..........
- and v7.16B, v3.16B, v30.16B // ....................*.........
- add v16.8H, v23.8H, v7.8H // .....................*........
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q30, [x0, #32] // *..............................
- // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*..........................
- // ldr q2, [x0, #48] // ..*............................
- // srshr v19.8H, v22.8H, #11 // ........*......................
- // mls v30.8H, v19.8H, v3.H[0] // ...........*...................
- // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................
- // sshr v31.8H, v30.8H, #15 // ................*..............
- // srshr v25.8H, v25.8H, #11 // ..........*....................
- // and v18.16B, v3.16B, v31.16B // ..................*............
- // mls v2.8H, v25.8H, v3.H[0] // .............*.................
- // add v21.8H, v30.8H, v18.8H // ...................*...........
- // ldr q5, [x0, #16] // ..............*................
- // sshr v18.8H, v2.8H, #15 // .................*.............
- // and v27.16B, v3.16B, v18.16B // ....................*..........
- // add v16.8H, v2.8H, v27.8H // .....................*.........
-
- sub count, count, #1
-1:
- // Instructions: 32
- // Expected cycles: 36
- // Expected IPC: 0.89
-
- // Cycle bound: 36.0
- // IPC bound: 0.89
-
- // Wall time: 1.05s
- // User time: 1.05s
-
- // -------- cycle (expected) --------->
- // 0 25
- // |------------------------|----------
- ldr q6, [x0], #64 // *...................................
- ldr q30, [x0, #32] // ..e.................................
- sqdmulh v31.8H, v6.8H, v4.H[0] // ....*...............................
- sqdmulh v29.8H, v5.8H, v4.H[0] // .....*..............................
- sqdmulh v22.8H, v30.8H, v4.H[0] // ......e.............................
- str q16, [x0, #-16] // .......*............................
- srshr v20.8H, v31.8H, #11 // ........*...........................
- srshr v28.8H, v29.8H, #11 // .........*..........................
- str q21, [x0, #-32] // ..........*.........................
- mls v6.8H, v20.8H, v3.H[0] // ...........*........................
- mls v5.8H, v28.8H, v3.H[0] // ............*.......................
- ldr q2, [x0, #48] // .............e......................
- sshr v31.8H, v6.8H, #15 // ...............*....................
- srshr v19.8H, v22.8H, #11 // ................e...................
- and v22.16B, v3.16B, v31.16B // .................*..................
- add v0.8H, v6.8H, v22.8H // ..................*.................
- mls v30.8H, v19.8H, v3.H[0] // ...................e................
- sshr v26.8H, v5.8H, #15 // ....................*...............
- sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e..............
- and v17.16B, v3.16B, v26.16B // ......................*.............
- add v1.8H, v5.8H, v17.8H // .......................*............
- sshr v31.8H, v30.8H, #15 // ........................e...........
- srshr v25.8H, v25.8H, #11 // .........................e..........
- str q1, [x0, #-48] // ..........................*.........
- and v18.16B, v3.16B, v31.16B // ...........................e........
- mls v2.8H, v25.8H, v3.H[0] // ............................e.......
- add v21.8H, v30.8H, v18.8H // .............................e......
- ldr q5, [x0, #16] // ..............................e.....
- sshr v18.8H, v2.8H, #15 // ................................e...
- str q0, [x0, #-64] // .................................*..
- and v27.16B, v3.16B, v18.16B // ..................................e.
- add v16.8H, v2.8H, v27.8H // ...................................e
-
- // ------------------------ cycle (expected) ------------------------->
- // 0 25 50
- // |------------------------|------------------------|-----------------
- // ldr q0, [x0], #64 // ..................................*.................................
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*.............................
- // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*.........................
- // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*......................
- // sshr v2.8h, v0.8h, #15 // .............~....................'..............*..................
- // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................
- // add v0.8h, v0.8h, v2.8h // ................~.................'.................*...............
- // str q0, [x0, #-64] // ...............................~..'................................*
- // ldr q0, [x0, #-48] // ............................e.....'.............................~...
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................
- // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................
- // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*.....................
- // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*.............
- // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*...........
- // add v0.8h, v0.8h, v2.8h // .....................~............'......................*..........
- // str q0, [x0, #-48] // ........................~.........'.........................*.......
- // ldr q0, [x0, #-32] // e.................................'.~...............................
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~...........................
- // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~.................
- // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~..............
- // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~.........
- // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~......
- // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~....
- // str q0, [x0, #-32] // ........~.........................'.........*.......................
- // ldr q0, [x0, #-16] // ...........e......................'............~....................
- // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............
- // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........
- // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~.....
- // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~.
- // and v2.16b, v3.16b, v2.16b // ................................e.'.................................
- // add v0.8h, v0.8h, v2.8h // .................................e'.................................
- // str q0, [x0, #-16] // .....~............................'......*..........................
-
- sub count, count, 1
- cbnz count, 1b
- // Instructions: 17
- // Expected cycles: 23
- // Expected IPC: 0.74
-
- // Cycle bound: 23.0
- // IPC bound: 0.74
-
- // Wall time: 0.05s
- // User time: 0.05s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- sqdmulh v20.8H, v5.8H, v4.H[0] // *.............................
- ldr q24, [x0], #64 // .*............................
- str q21, [x0, #-32] // ...*..........................
- srshr v20.8H, v20.8H, #11 // ....*.........................
- sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................
- str q16, [x0, #-16] // ......*.......................
- mls v5.8H, v20.8H, v3.H[0] // .......*......................
- srshr v20.8H, v25.8H, #11 // .........*....................
- sshr v2.8H, v5.8H, #15 // ...........*..................
- mls v24.8H, v20.8H, v3.H[0] // ............*.................
- and v20.16B, v3.16B, v2.16B // .............*................
- add v31.8H, v5.8H, v20.8H // ..............*...............
- sshr v20.8H, v24.8H, #15 // ................*.............
- str q31, [x0, #-48] // .................*............
- and v31.16B, v3.16B, v20.16B // ..................*...........
- add v24.8H, v24.8H, v31.8H // ...................*..........
- str q24, [x0, #-64] // ......................*.......
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q6, [x0], #64 // .*.............................
- // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*.........................
- // sqdmulh v29.8H, v5.8H, v4.H[0] // *..............................
- // str q16, [x0, #-16] // ......*........................
- // srshr v20.8H, v31.8H, #11 // .........*.....................
- // srshr v28.8H, v29.8H, #11 // ....*..........................
- // str q21, [x0, #-32] // ...*...........................
- // mls v6.8H, v20.8H, v3.H[0] // ............*..................
- // mls v5.8H, v28.8H, v3.H[0] // .......*.......................
- // sshr v31.8H, v6.8H, #15 // ................*..............
- // and v22.16B, v3.16B, v31.16B // ..................*............
- // add v0.8H, v6.8H, v22.8H // ...................*...........
- // sshr v26.8H, v5.8H, #15 // ...........*...................
- // and v17.16B, v3.16B, v26.16B // .............*.................
- // add v1.8H, v5.8H, v17.8H // ..............*................
- // str q1, [x0, #-48] // .................*.............
- // str q0, [x0, #-64] // ......................*........
-
-
- ret
-
- .unreq ptr
- .unreq count
- .unreq wtmp
-
- .unreq data
- .unreq q_data
-
- .unreq tmp
- .unreq mask
- .unreq modulus
- .unreq modulus_twisted
-
-/********************************************
- * poly_mulcache_compute() *
- ********************************************/
-
-.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt)
-
- cache_ptr .req x0
- data_ptr .req x1
- zeta_ptr .req x2
- zeta_twisted_ptr .req x3
- count .req x4
- wtmp .req w5
-
- data_odd .req v0
- zeta .req v1
- q_zeta .req q1
- zeta_twisted .req v2
- q_zeta_twisted .req q2
-
- tmp0 .req v3
- q_tmp0 .req q3
- tmp1 .req v4
- q_tmp1 .req q4
- dst .req v5
- q_dst .req q5
-
- modulus .req v6
- modulus_twisted .req v7
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #20159
- dup modulus_twisted.8h, wtmp
-
- mov count, #16
- // Instructions: 7
- // Expected cycles: 12
- // Expected IPC: 0.58
-
- // Cycle bound: 12.0
- // IPC bound: 0.58
-
- // Wall time: 0.01s
- // User time: 0.01s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q1, [x1, #16] // *.............................
- ldr q27, [x1], #32 // ..*...........................
- ldr q23, [x2], #16 // ....*.........................
- uzp2 v27.8H, v27.8H, v1.8H // ......*.......................
- ldr q1, [x3], #16 // .......*......................
- mul v2.8H, v27.8H, v23.8H // .........*....................
- sqrdmulh v27.8H, v27.8H, v1.8H // ...........*..................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q29, [x1, #16] // *..............................
- // ldr q21, [x2], #16 // ....*..........................
- // ldr q27, [x1], #32 // ..*............................
- // ldr q7, [x3], #16 // .......*.......................
- // uzp2 v28.8H, v27.8H, v29.8H // ......*........................
- // mul v2.8H, v28.8H, v21.8H // .........*.....................
- // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*...................
-
- sub count, count, #1
-1:
- // Instructions: 9
- // Expected cycles: 13
- // Expected IPC: 0.69
-
- // Cycle bound: 13.0
- // IPC bound: 0.69
-
- // Wall time: 0.09s
- // User time: 0.09s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q29, [x1, #16] // e.............................
- ldr q21, [x2], #16 // ..e...........................
- mls v2.8H, v27.8H, v6.H[0] // ....*.........................
- ldr q27, [x1], #32 // .....e........................
- ldr q7, [x3], #16 // .......e......................
- uzp2 v28.8H, v27.8H, v29.8H // .........e....................
- str q2, [x0], #16 // ..........*...................
- mul v2.8H, v28.8H, v21.8H // ...........e..................
- sqrdmulh v27.8H, v28.8H, v7.8H // ............e.................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q3, [x1], #32 // .....e.......'....~.......'....
- // ldr q4, [x1, #-16] // e............~............~....
- // ldr q1, [x2], #16 // ..e..........'.~..........'.~..
- // ldr q2, [x3], #16 // .......e.....'......~.....'....
- // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'....
- // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'....
- // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'....
- // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'....
- // str q5, [x0], #16 // ..........~..'.........*..'....
-
- sub count, count, 1
- cbnz count, 1b
- // Instructions: 2
- // Expected cycles: 5
- // Expected IPC: 0.40
-
- // Cycle bound: 5.0
- // IPC bound: 0.40
-
- // Wall time: 0.00s
- // User time: 0.00s
-
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- mls v2.8H, v27.8H, v6.H[0] // *.............................
- str q2, [x0], #16 // ....*.........................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // mls v2.8H, v27.8H, v6.H[0] // *..............................
- // str q2, [x0], #16 // ....*..........................
-
-
- ret
-
- .unreq cache_ptr
- .unreq data_ptr
- .unreq zeta_ptr
- .unreq zeta_twisted_ptr
- .unreq count
- .unreq wtmp
-
- .unreq data_odd
- .unreq zeta
- .unreq q_zeta
- .unreq zeta_twisted
- .unreq q_zeta_twisted
-
- .unreq tmp0
- .unreq q_tmp0
- .unreq tmp1
- .unreq q_tmp1
- .unreq dst
- .unreq q_dst
-
- .unreq modulus
- .unreq modulus_twisted
-
-/********************************************
- * poly_tobytes() *
- ********************************************/
-.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt)
-
- data0 .req v0
- data1 .req v1
- out0 .req v2
- out1 .req v3
- out2 .req v4
- tmp .req v5
-
- dst .req x0
- src .req x1
- count .req x2
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt):
-
- mov count, #16
-poly_tobytes_asm_opt_asm_loop_start:
- ld2 {data0.8h, data1.8h}, [src], #32
-
- // r[3 * i + 0] = (t0 >> 0);
- xtn out0.8b, data0.8h
-
- // r[3 * i + 1] = (t0 >> 8);
- shrn out1.8b, data0.8h, #8
- xtn tmp.8b, data1.8h
- // r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
- sli out1.8b, tmp.8b, #4
-
- // r[3 * i + 2] = (t1 >> 4);
- shrn out2.8b, data1.8h, #4
-
- st3 {out0.8b, out1.8b, out2.8b}, [dst], #24
-
- subs count, count, #1
- cbnz count, poly_tobytes_asm_opt_asm_loop_start
- ret
-
- .unreq data0
- .unreq data1
- .unreq out0
- .unreq out1
- .unreq out2
- .unreq tmp
- .unreq dst
- .unreq src
- .unreq count
-
-/**********************************
- * poly_tomont() *
- **********************************/
-.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt)
-
- src .req x0
- count .req x1
- wtmp .req w2
-
- data .req v0
- q_data .req q0
- res .req v1
- q_res .req q1
-
- factor .req v2
- factor_t .req v3
- modulus .req v4
- modulus_twisted .req v5
-
- tmp0 .req v6
-
-.balign 4
-MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
-
- mov wtmp, #3329 // ML-KEM modulus
- dup modulus.8h, wtmp
-
- mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
- dup modulus_twisted.8h, wtmp
-
- mov wtmp, #-1044 // 2^16 % 3329
- dup factor.8h, wtmp
-
- mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
- dup factor_t.8h, wtmp
-
- mov count, #8
- // Instructions: 5
- // Expected cycles: 7
- // Expected IPC: 0.71
- //
- // Cycle bound: 7.0
- // IPC bound: 0.71
- //
- // Wall time: 0.01s
- // User time: 0.01s
- //
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- ldr q26, [x0, #48] // *.............................
- ldr q23, [x0, #16] // ..*...........................
- mul v17.8H, v26.8H, v2.8H // ....*.........................
- sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................
- ldr q27, [x0, #32] // ......*.......................
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // ldr q7, [x0, #48] // *..............................
- // ldr q23, [x0, #16] // ..*............................
- // mul v17.8H, v7.8H, v2.8H // ....*..........................
- // sqrdmulh v7.8H, v7.8H, v3.8H // .....*.........................
- // ldr q27, [x0, #32] // ......*........................
-
- sub count, count, #1
-1:
- // Instructions: 20
- // Expected cycles: 24
- // Expected IPC: 0.83
- //
- // Cycle bound: 24.0
- // IPC bound: 0.83
- //
- // Wall time: 0.73s
- // User time: 0.73s
- //
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- mls v17.8H, v7.8H, v4.H[0] // *.............................
- sqrdmulh v5.8H, v23.8H, v3.8H // .*............................
- ldr q7, [x0], #64 // ..*...........................
- str q17, [x0, #-16] // ....*.........................
- sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................
- sqrdmulh v19.8H, v7.8H, v3.8H // ......*.......................
- mul v25.8H, v23.8H, v2.8H // .......*......................
- mul v0.8H, v7.8H, v2.8H // ........*.....................
- mul v26.8H, v27.8H, v2.8H // .........*....................
- ldr q7, [x0, #48] // ..........e...................
- mls v25.8H, v5.8H, v4.H[0] // ............*.................
- ldr q23, [x0, #16] // .............e................
- mls v26.8H, v29.8H, v4.H[0] // ...............*..............
- mls v0.8H, v19.8H, v4.H[0] // ................*.............
- str q25, [x0, #-48] // .................*............
- mul v17.8H, v7.8H, v2.8H // ..................e...........
- sqrdmulh v7.8H, v7.8H, v3.8H // ...................e..........
- str q0, [x0, #-64] // ....................*.........
- ldr q27, [x0, #32] // .....................e........
- str q26, [x0, #-32] // .......................*......
-
- // --------- cycle (expected) ---------->
- // 0 25
- // |------------------------|------------
- // ldr q0, [x0], #64 // ..............'.*.....................
- // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*.................
- // mul v1.8h, v0.8h, v2.8h // ..............'.......*...............
- // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*.......
- // str q1, [x0, #-64] // ..........~...'...................*...
- // ldr q0, [x0, #-48] // ...e..........'............~..........
- // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*......................
- // mul v1.8h, v0.8h, v2.8h // ..............'......*................
- // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*...........
- // str q1, [x0, #-48] // .......~......'................*......
- // ldr q0, [x0, #-32] // ...........e..'....................~..
- // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*..................
- // mul v1.8h, v0.8h, v2.8h // ..............'........*..............
- // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........
- // str q1, [x0, #-32] // .............~'......................*
- // ldr q0, [x0, #-16] // e.............'.........~.............
- // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~....
- // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~.....
- // mls v1.8h, v6.8h, v4.h[0] // ..............*.......................
- // str q1, [x0, #-16] // ..............'...*...................
-
- sub count, count, 1
- cbnz count, 1b
- // Instructions: 15
- // Expected cycles: 18
- // Expected IPC: 0.83
- //
- // Cycle bound: 18.0
- // IPC bound: 0.83
- //
- // Wall time: 0.07s
- // User time: 0.07s
- //
- // ----- cycle (expected) ------>
- // 0 25
- // |------------------------|----
- mls v17.8H, v7.8H, v4.H[0] // *.............................
- sqrdmulh v7.8H, v23.8H, v3.8H // .*............................
- mul v26.8H, v23.8H, v2.8H // ..*...........................
- sqrdmulh v25.8H, v27.8H, v3.8H // ...*..........................
- ldr q23, [x0], #64 // ....*.........................
- mul v27.8H, v27.8H, v2.8H // ......*.......................
- mls v26.8H, v7.8H, v4.H[0] // .......*......................
- sqrdmulh v7.8H, v23.8H, v3.8H // ........*.....................
- mul v23.8H, v23.8H, v2.8H // .........*....................
- str q17, [x0, #-16] // ..........*...................
- mls v27.8H, v25.8H, v4.H[0] // ...........*..................
- str q26, [x0, #-48] // ............*.................
- mls v23.8H, v7.8H, v4.H[0] // .............*................
- str q27, [x0, #-32] // ...............*..............
- str q23, [x0, #-64] // .................*............
-
- // ------ cycle (expected) ------>
- // 0 25
- // |------------------------|-----
- // mls v17.8H, v7.8H, v4.H[0] // *..............................
- // sqrdmulh v5.8H, v23.8H, v3.8H // .*.............................
- // ldr q7, [x0], #64 // ....*..........................
- // str q17, [x0, #-16] // ..........*....................
- // sqrdmulh v29.8H, v27.8H, v3.8H // ...*...........................
- // sqrdmulh v19.8H, v7.8H, v3.8H // ........*......................
- // mul v25.8H, v23.8H, v2.8H // ..*............................
- // mul v0.8H, v7.8H, v2.8H // .........*.....................
- // mul v26.8H, v27.8H, v2.8H // ......*........................
- // mls v25.8H, v5.8H, v4.H[0] // .......*.......................
- // mls v26.8H, v29.8H, v4.H[0] // ...........*...................
- // mls v0.8H, v19.8H, v4.H[0] // .............*.................
- // str q25, [x0, #-48] // ............*..................
- // str q0, [x0, #-64] // .................*.............
- // str q26, [x0, #-32] // ...............*...............
-
-
- ret
-
- .unreq src
- .unreq count
- .unreq wtmp
-
- .unreq data
- .unreq q_data
- .unreq res
- .unreq q_res
-
- .unreq factor
- .unreq factor_t
- .unreq modulus
- .unreq modulus_twisted
-
- .unreq tmp0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_reduce_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_reduce_asm_opt.S
new file mode 100644
index 000000000..410950730
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_reduce_asm_opt.S
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/* Barrett reduction of the eight 16-bit lanes of \a, in place; clobbers `tmp`. Expects the modulus in modulus.h[0] and its twist in modulus_twisted.h[0]. Not invoked by the scheduled code in this file. */
+.macro barrett_reduce a
+ sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] // tmp = high half of the doubled product a * twist (saturating)
+ srshr tmp.8h, tmp.8h, #11 // rounding shift right by 11 gives the quotient estimate
+ mls \a\().8h, tmp.8h, modulus.h[0] // a -= quotient estimate * modulus
+.endm
+
+/* Maps each 16-bit lane of \a from the signed canonical representative to the
+ * unsigned canonical one by adding the modulus to negative lanes only.
+ *
+ * Expects the modulus in every lane of `modulus`; clobbers `mask`. */
+.macro scalar_signed_to_unsigned a
+ sshr mask.8h, \a\().8h, #15 // all-ones in negative lanes, zero elsewhere
+ and mask.16b, modulus.16b, mask.16b // modulus in negative lanes, zero elsewhere
+ add \a\().8h, \a\().8h, mask.8h // conditional addition of the modulus
+.endm
+
+/**********************************
+ * poly_reduce() *
+ **********************************/
+
+ ptr .req x0 // base pointer of the buffer that is read and written back
+ count .req x1 // loop counter, set by the prologue
+ wtmp .req w2 // scratch for materialising constants before `dup`
+
+ data .req v0 // named in the reference (commented-out) code only
+ q_data .req q0 // 128-bit view of `data`
+
+ tmp .req v1 // scratch used by the barrett_reduce macro
+ mask .req v2 // scratch used by the scalar_signed_to_unsigned macro
+ modulus .req v3 // modulus broadcast to all 16-bit lanes
+ modulus_twisted .req v4 // Barrett constant broadcast to all 16-bit lanes
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
+ // In place on the 8 * 64 = 512 bytes at x0: Barrett-reduce every 16-bit lane, then add the modulus to lanes that are negative.
+ mov wtmp, #3329 // ML-KEM modulus
+ dup modulus.8h, wtmp
+
+ mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+ dup modulus_twisted.8h, wtmp
+
+ mov count, #8 // 8 blocks of 64 bytes in total
+ // Instructions: 15
+ // Expected cycles: 22
+ // Expected IPC: 0.68
+
+ // Cycle bound: 22.0
+ // IPC bound: 0.68
+
+ // Wall time: 0.05s
+ // User time: 0.05s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q21, [x0, #32] // *.............................
+ ldr q23, [x0, #48] // ..*...........................
+ sqdmulh v7.8H, v21.8H, v4.H[0] // ....*.........................
+ sqdmulh v30.8H, v23.8H, v4.H[0] // ......*.......................
+ srshr v7.8H, v7.8H, #11 // ........*.....................
+ srshr v30.8H, v30.8H, #11 // ..........*...................
+ mls v21.8H, v7.8H, v3.H[0] // ...........*..................
+ mls v23.8H, v30.8H, v3.H[0] // .............*................
+ ldr q5, [x0, #16] // ..............*...............
+ sshr v7.8H, v21.8H, #15 // ................*.............
+ sshr v30.8H, v23.8H, #15 // .................*............
+ and v7.16B, v3.16B, v7.16B // ..................*...........
+ add v21.8H, v21.8H, v7.8H // ...................*..........
+ and v7.16B, v3.16B, v30.16B // ....................*.........
+ add v16.8H, v23.8H, v7.8H // .....................*........
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q30, [x0, #32] // *..............................
+ // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*..........................
+ // ldr q2, [x0, #48] // ..*............................
+ // srshr v19.8H, v22.8H, #11 // ........*......................
+ // mls v30.8H, v19.8H, v3.H[0] // ...........*...................
+ // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................
+ // sshr v31.8H, v30.8H, #15 // ................*..............
+ // srshr v25.8H, v25.8H, #11 // ..........*....................
+ // and v18.16B, v3.16B, v31.16B // ..................*............
+ // mls v2.8H, v25.8H, v3.H[0] // .............*.................
+ // add v21.8H, v30.8H, v18.8H // ...................*...........
+ // ldr q5, [x0, #16] // ..............*................
+ // sshr v18.8H, v2.8H, #15 // .................*.............
+ // and v27.16B, v3.16B, v18.16B // ....................*..........
+ // add v16.8H, v2.8H, v27.8H // .....................*.........
+
+ sub count, count, #1 // 7 loop iterations; the code before and after the loop covers the remaining work
+poly_reduce_asm_opt_loop:
+ // Instructions: 32
+ // Expected cycles: 36
+ // Expected IPC: 0.89
+
+ // Cycle bound: 36.0
+ // IPC bound: 0.89
+
+ // Wall time: 1.05s
+ // User time: 1.05s
+
+ // -------- cycle (expected) --------->
+ // 0 25
+ // |------------------------|----------
+ ldr q6, [x0], #64 // *...................................
+ ldr q30, [x0, #32] // ..e.................................
+ sqdmulh v31.8H, v6.8H, v4.H[0] // ....*...............................
+ sqdmulh v29.8H, v5.8H, v4.H[0] // .....*..............................
+ sqdmulh v22.8H, v30.8H, v4.H[0] // ......e.............................
+ str q16, [x0, #-16] // .......*............................
+ srshr v20.8H, v31.8H, #11 // ........*...........................
+ srshr v28.8H, v29.8H, #11 // .........*..........................
+ str q21, [x0, #-32] // ..........*.........................
+ mls v6.8H, v20.8H, v3.H[0] // ...........*........................
+ mls v5.8H, v28.8H, v3.H[0] // ............*.......................
+ ldr q2, [x0, #48] // .............e......................
+ sshr v31.8H, v6.8H, #15 // ...............*....................
+ srshr v19.8H, v22.8H, #11 // ................e...................
+ and v22.16B, v3.16B, v31.16B // .................*..................
+ add v0.8H, v6.8H, v22.8H // ..................*.................
+ mls v30.8H, v19.8H, v3.H[0] // ...................e................
+ sshr v26.8H, v5.8H, #15 // ....................*...............
+ sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e..............
+ and v17.16B, v3.16B, v26.16B // ......................*.............
+ add v1.8H, v5.8H, v17.8H // .......................*............
+ sshr v31.8H, v30.8H, #15 // ........................e...........
+ srshr v25.8H, v25.8H, #11 // .........................e..........
+ str q1, [x0, #-48] // ..........................*.........
+ and v18.16B, v3.16B, v31.16B // ...........................e........
+ mls v2.8H, v25.8H, v3.H[0] // ............................e.......
+ add v21.8H, v30.8H, v18.8H // .............................e......
+ ldr q5, [x0, #16] // ..............................e.....
+ sshr v18.8H, v2.8H, #15 // ................................e...
+ str q0, [x0, #-64] // .................................*..
+ and v27.16B, v3.16B, v18.16B // ..................................e.
+ add v16.8H, v2.8H, v27.8H // ...................................e
+
+ // ------------------------ cycle (expected) ------------------------->
+ // 0 25 50
+ // |------------------------|------------------------|-----------------
+ // ldr q0, [x0], #64 // ..................................*.................................
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*.............................
+ // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*.........................
+ // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*......................
+ // sshr v2.8h, v0.8h, #15 // .............~....................'..............*..................
+ // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................
+ // add v0.8h, v0.8h, v2.8h // ................~.................'.................*...............
+ // str q0, [x0, #-64] // ...............................~..'................................*
+ // ldr q0, [x0, #-48] // ............................e.....'.............................~...
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................
+ // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................
+ // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*.....................
+ // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*.............
+ // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*...........
+ // add v0.8h, v0.8h, v2.8h // .....................~............'......................*..........
+ // str q0, [x0, #-48] // ........................~.........'.........................*.......
+ // ldr q0, [x0, #-32] // e.................................'.~...............................
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~...........................
+ // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~.................
+ // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~..............
+ // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~.........
+ // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~......
+ // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~....
+ // str q0, [x0, #-32] // ........~.........................'.........*.......................
+ // ldr q0, [x0, #-16] // ...........e......................'............~....................
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............
+ // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........
+ // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~.....
+ // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~.
+ // and v2.16b, v3.16b, v2.16b // ................................e.'.................................
+ // add v0.8h, v0.8h, v2.8h // .................................e'.................................
+ // str q0, [x0, #-16] // .....~............................'......*..........................
+
+ sub count, count, 1
+ cbnz count, poly_reduce_asm_opt_loop
+ // Instructions: 17
+ // Expected cycles: 23
+ // Expected IPC: 0.74
+
+ // Cycle bound: 23.0
+ // IPC bound: 0.74
+
+ // Wall time: 0.05s
+ // User time: 0.05s
+
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ sqdmulh v20.8H, v5.8H, v4.H[0] // *.............................
+ ldr q24, [x0], #64 // .*............................
+ str q21, [x0, #-32] // ...*..........................
+ srshr v20.8H, v20.8H, #11 // ....*.........................
+ sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................
+ str q16, [x0, #-16] // ......*.......................
+ mls v5.8H, v20.8H, v3.H[0] // .......*......................
+ srshr v20.8H, v25.8H, #11 // .........*....................
+ sshr v2.8H, v5.8H, #15 // ...........*..................
+ mls v24.8H, v20.8H, v3.H[0] // ............*.................
+ and v20.16B, v3.16B, v2.16B // .............*................
+ add v31.8H, v5.8H, v20.8H // ..............*...............
+ sshr v20.8H, v24.8H, #15 // ................*.............
+ str q31, [x0, #-48] // .................*............
+ and v31.16B, v3.16B, v20.16B // ..................*...........
+ add v24.8H, v24.8H, v31.8H // ...................*..........
+ str q24, [x0, #-64] // ......................*.......
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q6, [x0], #64 // .*.............................
+ // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*.........................
+ // sqdmulh v29.8H, v5.8H, v4.H[0] // *..............................
+ // str q16, [x0, #-16] // ......*........................
+ // srshr v20.8H, v31.8H, #11 // .........*.....................
+ // srshr v28.8H, v29.8H, #11 // ....*..........................
+ // str q21, [x0, #-32] // ...*...........................
+ // mls v6.8H, v20.8H, v3.H[0] // ............*..................
+ // mls v5.8H, v28.8H, v3.H[0] // .......*.......................
+ // sshr v31.8H, v6.8H, #15 // ................*..............
+ // and v22.16B, v3.16B, v31.16B // ..................*............
+ // add v0.8H, v6.8H, v22.8H // ...................*...........
+ // sshr v26.8H, v5.8H, #15 // ...........*...................
+ // and v17.16B, v3.16B, v26.16B // .............*.................
+ // add v1.8H, v5.8H, v17.8H // ..............*................
+ // str q1, [x0, #-48] // .................*.............
+ // str q0, [x0, #-64] // ......................*........
+
+
+ ret
+
+ .unreq ptr
+ .unreq count
+ .unreq wtmp
+
+ .unreq data
+ .unreq q_data
+
+ .unreq tmp
+ .unreq mask
+ .unreq modulus
+ .unreq modulus_twisted
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S
new file mode 100644
index 000000000..bc33afd43
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/********************************************
+ * poly_tobytes() *
+ ********************************************/
+
+ data0 .req v0
+ data1 .req v1
+ out0 .req v2
+ out1 .req v3
+ out2 .req v4
+ tmp .req v5
+
+ dst .req x0
+ src .req x1
+ count .req x2
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt):
+ // Packs 16 * 16 = 256 16-bit values read from src into 16 * 24 = 384 bytes written to dst: each pair (t0, t1) becomes three bytes.
+ mov count, #16
+poly_tobytes_asm_opt_asm_loop_start:
+ ld2 {data0.8h, data1.8h}, [src], #32 // de-interleave: data0 = even-indexed values (t0), data1 = odd-indexed values (t1)
+
+ // r[3 * i + 0] = (t0 >> 0);
+ xtn out0.8b, data0.8h
+
+ // r[3 * i + 1] = (t0 >> 8);
+ shrn out1.8b, data0.8h, #8
+ xtn tmp.8b, data1.8h // low byte of t1
+ // r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
+ sli out1.8b, tmp.8b, #4 // shift-left-insert keeps the low 4 bits already in out1
+
+ // r[3 * i + 2] = (t1 >> 4);
+ shrn out2.8b, data1.8h, #4
+
+ st3 {out0.8b, out1.8b, out2.8b}, [dst], #24
+
+ subs count, count, #1 // also updates NZCV, although cbnz below tests the register directly
+ cbnz count, poly_tobytes_asm_opt_asm_loop_start
+ ret
+
+ .unreq data0
+ .unreq data1
+ .unreq out0
+ .unreq out1
+ .unreq out2
+ .unreq tmp
+ .unreq dst
+ .unreq src
+ .unreq count
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tomont_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tomont_asm_opt.S
new file mode 100644
index 000000000..bcbff9adb
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tomont_asm_opt.S
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
+
+/* Modular multiplication of \src by the constant \const, using its precomputed twist \const_twisted; result in \dst.
+ * Expects modulus in modulus.h[0]; clobbers tmp0. Not invoked by the scheduled code in this file. */
+.macro mulmod dst, src, const, const_twisted
+ sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h // rounded high half of the doubled product src * twist
+ mul \dst\().8h, \src\().8h, \const\().8h // low 16 bits of src * const
+ mls \dst\().8h, tmp0.8h, modulus.h[0] // subtract tmp0 * modulus
+.endm
+
+/**********************************
+ * poly_tomont() *
+ **********************************/
+
+ src .req x0 // base pointer of the buffer that is read and written back
+ count .req x1 // loop counter, set by the prologue
+ wtmp .req w2 // scratch for materialising constants before `dup`
+
+ data .req v0 // named in the reference (commented-out) code only
+ q_data .req q0 // 128-bit view of `data`
+ res .req v1 // named in the reference (commented-out) code only
+ q_res .req q1 // 128-bit view of `res`
+
+ factor .req v2 // multiplier constant broadcast to all 16-bit lanes
+ factor_t .req v3 // twisted multiplier constant broadcast to all 16-bit lanes
+ modulus .req v4 // modulus broadcast to all 16-bit lanes
+ modulus_twisted .req v5 // initialised by the prologue
+
+ tmp0 .req v6 // scratch used by the mulmod macro
+
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
+ // In place on the 8 * 64 = 512 bytes at x0: multiplies every 16-bit lane by `factor` modulo the modulus (mul / sqrdmulh / mls).
+ mov wtmp, #3329 // ML-KEM modulus
+ dup modulus.8h, wtmp
+
+ mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+ dup modulus_twisted.8h, wtmp
+ // NOTE(review): modulus_twisted (v5) is not read anywhere below; the loop overwrites v5 as scratch (sqrdmulh v5.8H, ...), so this constant looks unused - confirm upstream.
+ mov wtmp, #-1044 // 2^16 % 3329
+ dup factor.8h, wtmp
+
+ mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+ dup factor_t.8h, wtmp
+
+ mov count, #8 // 8 blocks of 64 bytes in total
+ // Instructions: 5
+ // Expected cycles: 7
+ // Expected IPC: 0.71
+ //
+ // Cycle bound: 7.0
+ // IPC bound: 0.71
+ //
+ // Wall time: 0.01s
+ // User time: 0.01s
+ //
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ ldr q26, [x0, #48] // *.............................
+ ldr q23, [x0, #16] // ..*...........................
+ mul v17.8H, v26.8H, v2.8H // ....*.........................
+ sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................
+ ldr q27, [x0, #32] // ......*.......................
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // ldr q7, [x0, #48] // *..............................
+ // ldr q23, [x0, #16] // ..*............................
+ // mul v17.8H, v7.8H, v2.8H // ....*..........................
+ // sqrdmulh v7.8H, v7.8H, v3.8H // .....*.........................
+ // ldr q27, [x0, #32] // ......*........................
+
+ sub count, count, #1 // 7 loop iterations; the code before and after the loop covers the remaining work
+poly_tomont_asm_opt_loop:
+ // Instructions: 20
+ // Expected cycles: 24
+ // Expected IPC: 0.83
+ //
+ // Cycle bound: 24.0
+ // IPC bound: 0.83
+ //
+ // Wall time: 0.73s
+ // User time: 0.73s
+ //
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ mls v17.8H, v7.8H, v4.H[0] // *.............................
+ sqrdmulh v5.8H, v23.8H, v3.8H // .*............................
+ ldr q7, [x0], #64 // ..*...........................
+ str q17, [x0, #-16] // ....*.........................
+ sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................
+ sqrdmulh v19.8H, v7.8H, v3.8H // ......*.......................
+ mul v25.8H, v23.8H, v2.8H // .......*......................
+ mul v0.8H, v7.8H, v2.8H // ........*.....................
+ mul v26.8H, v27.8H, v2.8H // .........*....................
+ ldr q7, [x0, #48] // ..........e...................
+ mls v25.8H, v5.8H, v4.H[0] // ............*.................
+ ldr q23, [x0, #16] // .............e................
+ mls v26.8H, v29.8H, v4.H[0] // ...............*..............
+ mls v0.8H, v19.8H, v4.H[0] // ................*.............
+ str q25, [x0, #-48] // .................*............
+ mul v17.8H, v7.8H, v2.8H // ..................e...........
+ sqrdmulh v7.8H, v7.8H, v3.8H // ...................e..........
+ str q0, [x0, #-64] // ....................*.........
+ ldr q27, [x0, #32] // .....................e........
+ str q26, [x0, #-32] // .......................*......
+
+ // --------- cycle (expected) ---------->
+ // 0 25
+ // |------------------------|------------
+ // ldr q0, [x0], #64 // ..............'.*.....................
+ // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*.................
+ // mul v1.8h, v0.8h, v2.8h // ..............'.......*...............
+ // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*.......
+ // str q1, [x0, #-64] // ..........~...'...................*...
+ // ldr q0, [x0, #-48] // ...e..........'............~..........
+ // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*......................
+ // mul v1.8h, v0.8h, v2.8h // ..............'......*................
+ // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*...........
+ // str q1, [x0, #-48] // .......~......'................*......
+ // ldr q0, [x0, #-32] // ...........e..'....................~..
+ // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*..................
+ // mul v1.8h, v0.8h, v2.8h // ..............'........*..............
+ // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........
+ // str q1, [x0, #-32] // .............~'......................*
+ // ldr q0, [x0, #-16] // e.............'.........~.............
+ // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~....
+ // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~.....
+ // mls v1.8h, v6.8h, v4.h[0] // ..............*.......................
+ // str q1, [x0, #-16] // ..............'...*...................
+
+ sub count, count, 1
+ cbnz count, poly_tomont_asm_opt_loop
+ // Instructions: 15
+ // Expected cycles: 18
+ // Expected IPC: 0.83
+ //
+ // Cycle bound: 18.0
+ // IPC bound: 0.83
+ //
+ // Wall time: 0.07s
+ // User time: 0.07s
+ //
+ // ----- cycle (expected) ------>
+ // 0 25
+ // |------------------------|----
+ mls v17.8H, v7.8H, v4.H[0] // *.............................
+ sqrdmulh v7.8H, v23.8H, v3.8H // .*............................
+ mul v26.8H, v23.8H, v2.8H // ..*...........................
+ sqrdmulh v25.8H, v27.8H, v3.8H // ...*..........................
+ ldr q23, [x0], #64 // ....*.........................
+ mul v27.8H, v27.8H, v2.8H // ......*.......................
+ mls v26.8H, v7.8H, v4.H[0] // .......*......................
+ sqrdmulh v7.8H, v23.8H, v3.8H // ........*.....................
+ mul v23.8H, v23.8H, v2.8H // .........*....................
+ str q17, [x0, #-16] // ..........*...................
+ mls v27.8H, v25.8H, v4.H[0] // ...........*..................
+ str q26, [x0, #-48] // ............*.................
+ mls v23.8H, v7.8H, v4.H[0] // .............*................
+ str q27, [x0, #-32] // ...............*..............
+ str q23, [x0, #-64] // .................*............
+
+ // ------ cycle (expected) ------>
+ // 0 25
+ // |------------------------|-----
+ // mls v17.8H, v7.8H, v4.H[0] // *..............................
+ // sqrdmulh v5.8H, v23.8H, v3.8H // .*.............................
+ // ldr q7, [x0], #64 // ....*..........................
+ // str q17, [x0, #-16] // ..........*....................
+ // sqrdmulh v29.8H, v27.8H, v3.8H // ...*...........................
+ // sqrdmulh v19.8H, v7.8H, v3.8H // ........*......................
+ // mul v25.8H, v23.8H, v2.8H // ..*............................
+ // mul v0.8H, v7.8H, v2.8H // .........*.....................
+ // mul v26.8H, v27.8H, v2.8H // ......*........................
+ // mls v25.8H, v5.8H, v4.H[0] // .......*.......................
+ // mls v26.8H, v29.8H, v4.H[0] // ...........*...................
+ // mls v0.8H, v19.8H, v4.H[0] // .............*.................
+ // str q25, [x0, #-48] // ............*..................
+ // str q0, [x0, #-64] // .................*.............
+ // str q26, [x0, #-32] // ...............*...............
+
+
+ ret
+
+ .unreq src
+ .unreq count
+ .unreq wtmp
+
+ .unreq data
+ .unreq q_data
+ .unreq res
+ .unreq q_res
+
+ .unreq factor
+ .unreq factor_t
+ .unreq modulus
+ .unreq modulus_twisted
+
+ .unreq tmp0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S
new file mode 100644
index 000000000..e336b92cb
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S
@@ -0,0 +1,546 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// AArch64 re-implementation of the asymmetric base multiplication from:
+
+// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+// https://eprint.iacr.org/2021/986
+// https://github.com/neon-ntt/neon-ntt
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 2
+/* simpasm: header-end */
+
+// Input:
+// - Vectors al, ah of 32-bit entries (named by the prefix \a)
+// Output:
+// - Montgomery reductions of al || ah, stored in \x; al, ah and t0 are clobbered
+.macro montgomery_reduce_long x, a
+ uzp1 t0.8h, \a\()l.8h, \a\()h.8h // low 16-bit halves of the eight 32-bit entries
+ mul t0.8h, t0.8h, modulus_twisted.8h // per-lane correction factor
+ smlal \a\()l.4s, t0.4h, modulus.4h // al += t0 * modulus (lower four lanes)
+ smlal2 \a\()h.4s, t0.8h, modulus.8h // ah += t0 * modulus (upper four lanes)
+ uzp2 \x\().8h, \a\()l.8h, \a\()h.8h // keep the high 16-bit halves as the result
+.endm
+
+// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+
+// Bounds:
+// - Assume |a| < 4096,
+// - Result: < 2*4096*2^15 = 2^28
+.macro pmull d, a, b
+ smull \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0 = a0 * b0
+ smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0 += a1 * b1t
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
+
+ smull \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1 = a0 * b1
+ smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1 += a1 * b0
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
+.endm
+
+.macro pmlal d, a, b // accumulating variant of pmull: adds both products onto the existing \d accumulators
+ smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0 += a0 * b0
+ smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0 += a1 * b1t
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
+
+ smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1 += a0 * b1
+ smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1 += a1 * b0
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
+.endm
+
+.macro ld2_wrap a, ptr // loads 32 bytes from \ptr and de-interleaves them into \a0 / \a1; clobbers tmp0, tmp1
+ ldr q_tmp0, [\ptr\()], #32 // first 16 bytes; \ptr advances by 32
+ ldr q_tmp1, [\ptr\(), #-16] // second 16 bytes, addressed from the advanced pointer
+ uzp1 \a\()0.8h, tmp0.8h, tmp1.8h // even-indexed 16-bit elements
+ uzp2 \a\()1.8h, tmp0.8h, tmp1.8h // odd-indexed 16-bit elements
+.endm
+
+.macro st2_wrap a, ptr // interleaves \a0 / \a1 and stores the 32 bytes at \ptr; clobbers tmp0, tmp1
+ zip1 tmp0.8h, \a\()0.8h, \a\()1.8h // interleave the lower halves
+ zip2 tmp1.8h, \a\()0.8h, \a\()1.8h // interleave the upper halves
+ str q_tmp0, [\ptr\()], #32 // first 16 bytes; \ptr advances by 32
+ str q_tmp1, [\ptr\(), #-16] // second 16 bytes, addressed from the advanced pointer
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr // loads the operands used by pmull / pmlal
+ ld2_wrap \a\(), \a_ptr // \a0, \a1 from \a_ptr (advances by 32)
+ ld2_wrap \b\(), \b_ptr // \b0, \b1 from \b_ptr (advances by 32)
+ ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 // eight 16-bit values for \b1t; \b_cache_ptr advances by 16
+.endm
+
+.macro save_vregs // spills d8-d15 to a fresh 64-byte stack area
+ sub sp, sp, #(16*4) // reserve 64 bytes
+ stp d8, d9, [sp, #16*0]
+ stp d10, d11, [sp, #16*1]
+ stp d12, d13, [sp, #16*2]
+ stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // reloads d8-d15 from the area written by save_vregs and releases it
+ ldp d8, d9, [sp, #16*0]
+ ldp d10, d11, [sp, #16*1]
+ ldp d12, d13, [sp, #16*2]
+ ldp d14, d15, [sp, #16*3]
+ add sp, sp, #(16*4) // release the 64 bytes
+.endm
+
+.macro push_stack // function-entry wrapper; currently only saves d8-d15
+ save_vregs
+.endm
+
+.macro pop_stack // function-exit wrapper; currently only restores d8-d15
+ restore_vregs
+.endm
+
+ out .req x0
+ a0_ptr .req x1
+ b0_ptr .req x2
+ b0_cache_ptr .req x3
+ a1_ptr .req x4
+ b1_ptr .req x5
+ b1_cache_ptr .req x6
+ a2_ptr .req x7
+ b2_ptr .req x8
+ b2_cache_ptr .req x9
+ a3_ptr .req x10
+ b3_ptr .req x11
+ b3_cache_ptr .req x12
+ count .req x13
+ wtmp .req w14
+
+ modulus .req v0
+ modulus_twisted .req v2
+
+ aa0 .req v3
+ aa1 .req v4
+ bb0 .req v5
+ bb1 .req v6
+ bb1t .req v7
+
+ res0l .req v8
+ res1l .req v9
+ res0h .req v10
+ res1h .req v11
+
+ tmp0 .req v12
+ tmp1 .req v13
+ q_tmp0 .req q12
+ q_tmp1 .req q13
+
+ out0 .req v26
+ out1 .req v27
+
+ t0 .req v28
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt):
+ push_stack
+
+ mov wtmp, #3329
+ dup modulus.8h, wtmp
+
+ mov wtmp, #3327
+ dup modulus_twisted.8h, wtmp
+
+ // Computed bases of vector entries
+
+ add a1_ptr, a0_ptr, #(1 * 512)
+ add b1_ptr, b0_ptr, #(1 * 512)
+ add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
+
+ mov count, #(MLKEM_N / 16)
+ // Instructions: 75
+ // Expected cycles: 94
+ // Expected IPC: 0.80
+
+ // Cycle bound: 94.0
+ // IPC bound: 0.80
+
+ // Wall time: 1.49s
+ // User time: 1.49s
+ // Reading aid: the live code comes first; the star on each live line is its index in the commented listing further down, and vice versa.
+ // --------------------------- original position ---------------------------->
+ // 0 25 50
+ // |------------------------|------------------------|
+ ldr q9, [x4], #32 // *..........................................................................
+ ldr q5, [x4, #-16] // ......*....................................................................
+ ldr q11, [x5], #32 // .*.........................................................................
+ uzp1 v23.8H, v9.8H, v5.8H // .........*.................................................................
+ uzp2 v9.8H, v9.8H, v5.8H // .....................*.....................................................
+ ldr q5, [x2], #32 // ..*........................................................................
+ ldr q7, [x5, #-16] // ..............*............................................................
+ ldr q21, [x2, #-16] // ...*.......................................................................
+ uzp2 v10.8H, v11.8H, v7.8H // .................*.........................................................
+ uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................
+ uzp1 v7.8H, v5.8H, v21.8H // ....*......................................................................
+ uzp2 v5.8H, v5.8H, v21.8H // .....*.....................................................................
+ ldr q21, [x1], #32 // .......*...................................................................
+ ldr q25, [x1, #-16] // ........*..................................................................
+ ld1 {v6.8H}, [x3], #16 // ............................*..............................................
+ uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................
+ uzp2 v21.8H, v21.8H, v25.8H // ...........*...............................................................
+ smull v25.4S, v26.4H, v5.4H // ............*..............................................................
+ smull2 v5.4S, v26.8H, v5.8H // .............*.............................................................
+ smull v19.4S, v26.4H, v7.4H // ..........................*................................................
+ smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................
+ smlal v25.4S, v21.4H, v7.4H // ...............*...........................................................
+ smlal2 v5.4S, v21.8H, v7.8H // ................*..........................................................
+ smlal v19.4S, v21.4H, v6.4H // ...................................*.......................................
+ smlal2 v26.4S, v21.8H, v6.8H // .................................*.........................................
+ smlal v25.4S, v23.4H, v10.4H // ...................*.......................................................
+ smlal2 v5.4S, v23.8H, v10.8H // ....................*......................................................
+ smlal v19.4S, v23.4H, v11.4H // ......................................*....................................
+ smlal2 v26.4S, v23.8H, v11.8H // ....................................*......................................
+ ld1 {v23.8H}, [x6], #16 // ........................*..................................................
+ smlal v25.4S, v9.4H, v11.4H // ......................*....................................................
+ smlal2 v5.4S, v9.8H, v11.8H // .......................*...................................................
+ smlal2 v26.4S, v9.8H, v23.8H // .......................................*...................................
+ smlal v19.4S, v9.4H, v23.4H // .........................................*.................................
+ ldr q9, [x4], #32 // ...............................*...........................................
+ uzp1 v11.8H, v25.8H, v5.8H // .........................*.................................................
+ uzp1 v23.8H, v19.8H, v26.8H // .............................................*.............................
+ mul v11.8H, v11.8H, v2.8H // ...........................*...............................................
+ mul v23.8H, v23.8H, v2.8H // ..............................................*............................
+ ldr q7, [x5], #32 // ................................*..........................................
+ smlal2 v5.4S, v11.8H, v0.8H // .............................*.............................................
+ smlal v25.4S, v11.4H, v0.4H // ..................................*........................................
+ ldr q11, [x2], #32 // .....................................*.....................................
+ ldr q21, [x2, #-16] // ........................................*..................................
+ ldr q6, [x4, #-16] // ...............................................*...........................
+ uzp1 v17.8H, v11.8H, v21.8H // ...........................................*...............................
+ ldr q10, [x1], #32 // ................................................*..........................
+ ldr q29, [x1, #-16] // .................................................*.........................
+ uzp2 v11.8H, v11.8H, v21.8H // ............................................*..............................
+ uzp1 v13.8H, v9.8H, v6.8H // ...................................................*.......................
+ uzp1 v3.8H, v10.8H, v29.8H // ....................................................*......................
+ uzp2 v10.8H, v10.8H, v29.8H // .....................................................*.....................
+ smull v12.4S, v3.4H, v11.4H // ......................................................*....................
+ smull2 v11.4S, v3.8H, v11.8H // .......................................................*...................
+ ldr q21, [x5, #-16] // ........................................................*..................
+ smlal v12.4S, v10.4H, v17.4H // .........................................................*.................
+ smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................
+ uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*...............
+ uzp1 v15.8H, v7.8H, v21.8H // ............................................................*..............
+ smlal v12.4S, v13.4H, v29.4H // .............................................................*.............
+ smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............
+ uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*...........
+ smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................
+ smlal v12.4S, v28.4H, v15.4H // .................................................................*.........
+ smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........
+ smlal v19.4S, v23.4H, v0.4H // ................................................................*..........
+ uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................
+ smull v23.4S, v3.4H, v17.4H // ......................................................................*....
+ uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*.....
+ uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*......
+ mul v14.8H, v9.8H, v2.8H // .......................................................................*...
+ ld1 {v22.8H}, [x6], #16 // ...................................................................*.......
+ zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
+ smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................*
+ ld1 {v4.8H}, [x3], #16 // .........................................................................*.
+
+ // ------------------------------ new position ------------------------------>
+ // 0 25 50
+ // |------------------------|------------------------|------------------------
+ // ldr q18, [x4], #32 // *..........................................................................
+ // ldr q30, [x5], #32 // ..*........................................................................
+ // ldr q8, [x2], #32 // .....*.....................................................................
+ // ldr q9, [x2, #-16] // .......*...................................................................
+ // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................
+ // uzp2 v4.8H, v8.8H, v9.8H // ...........*...............................................................
+ // ldr q19, [x4, #-16] // .*.........................................................................
+ // ldr q29, [x1], #32 // ............*..............................................................
+ // ldr q12, [x1, #-16] // .............*.............................................................
+ // uzp1 v13.8H, v18.8H, v19.8H // ...*.......................................................................
+ // uzp1 v3.8H, v29.8H, v12.8H // ...............*...........................................................
+ // uzp2 v10.8H, v29.8H, v12.8H // ................*..........................................................
+ // smull v12.4S, v3.4H, v4.4H // .................*.........................................................
+ // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................
+ // ldr q5, [x5, #-16] // ......*....................................................................
+ // smlal v12.4S, v10.4H, v17.4H // .....................*.....................................................
+ // smlal2 v11.4S, v10.8H, v17.8H // ......................*....................................................
+ // uzp2 v14.8H, v30.8H, v5.8H // ........*..................................................................
+ // uzp1 v15.8H, v30.8H, v5.8H // .........*.................................................................
+ // smlal v12.4S, v13.4H, v14.4H // .........................*.................................................
+ // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................
+ // uzp2 v28.8H, v18.8H, v19.8H // ....*......................................................................
+ // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................
+ // smlal2 v11.4S, v28.8H, v15.8H // ...............................*...........................................
+ // ld1 {v22.8H}, [x6], #16 // .............................*.............................................
+ // uzp1 v1.8H, v12.8H, v11.8H // ...................................*.......................................
+ // smull v23.4S, v3.4H, v17.4H // ...................*.......................................................
+ // mul v14.8H, v1.8H, v2.8H // .....................................*.....................................
+ // ld1 {v4.8H}, [x3], #16 // ..............*............................................................
+ // smlal2 v11.4S, v14.8H, v0.8H // ........................................*..................................
+ // smull2 v20.4S, v3.8H, v17.8H // ....................*......................................................
+ // ldr q18, [x4], #32 // ..................................*........................................
+ // ldr q30, [x5], #32 // .......................................*...................................
+ // smlal2 v20.4S, v10.8H, v4.8H // ........................*..................................................
+ // smlal v12.4S, v14.4H, v0.4H // .........................................*.................................
+ // smlal v23.4S, v10.4H, v4.4H // .......................*...................................................
+ // smlal2 v20.4S, v13.8H, v15.8H // ............................*..............................................
+ // ldr q8, [x2], #32 // ..........................................*................................
+ // smlal v23.4S, v13.4H, v15.4H // ...........................*...............................................
+ // smlal2 v20.4S, v28.8H, v22.8H // ................................*..........................................
+ // ldr q9, [x2, #-16] // ...........................................*...............................
+ // smlal v23.4S, v28.4H, v22.4H // .................................*.........................................
+ // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........
+ // uzp1 v17.8H, v8.8H, v9.8H // .............................................*.............................
+ // uzp2 v4.8H, v8.8H, v9.8H // ................................................*..........................
+ // uzp1 v5.8H, v23.8H, v20.8H // ....................................*......................................
+ // mul v31.8H, v5.8H, v2.8H // ......................................*....................................
+ // ldr q19, [x4, #-16] // ............................................*..............................
+ // ldr q29, [x1], #32 // ..............................................*............................
+ // ldr q12, [x1, #-16] // ...............................................*...........................
+ // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............
+ // uzp1 v13.8H, v18.8H, v19.8H // .................................................*.........................
+ // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................
+ // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*.......................
+ // smull v12.4S, v3.4H, v4.4H // ....................................................*......................
+ // smull2 v11.4S, v3.8H, v4.8H // .....................................................*.....................
+ // ldr q5, [x5, #-16] // ......................................................*....................
+ // smlal v12.4S, v10.4H, v17.4H // .......................................................*...................
+ // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*..................
+ // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*.................
+ // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................
+ // smlal v12.4S, v13.4H, v14.4H // ...........................................................*...............
+ // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*..............
+ // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*.............
+ // smlal v23.4S, v31.4H, v0.4H // .................................................................*.........
+ // smlal v12.4S, v28.4H, v15.4H // ...............................................................*...........
+ // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*..........
+ // ld1 {v22.8H}, [x6], #16 // .......................................................................*...
+ // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*.....
+ // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*......
+ // smull v23.4S, v3.4H, v17.4H // ...................................................................*.......
+ // mul v14.8H, v1.8H, v2.8H // ......................................................................*....
+ // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
+ // ld1 {v4.8H}, [x3], #16 // ..........................................................................*
+ // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*.
+
+ sub count, count, #2 // the 75 instructions above plus the 21 after the loop equal two 48-instruction loop bodies, so the loop runs count - 2 times
+polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop:
+ // Instructions: 48
+ // Expected cycles: 58
+ // Expected IPC: 0.83
+
+ // Cycle bound: 58.0
+ // IPC bound: 0.83
+
+ // Wall time: 6.39s
+ // User time: 6.39s
+ // NOTE(review): e / l seem to tag instructions working for the following / preceding logical iteration (loads are e, stores are l) - confirm with the scheduler docs.
+ // -------------- original position -------------->
+ // 0 25
+ // |------------------------|----------------------
+ smull2 v20.4S, v3.8H, v17.8H // ..........*.....................................
+ ldr q18, [x4], #32 // .................e..............................
+ ldr q30, [x5], #32 // .....................e..........................
+ smlal2 v20.4S, v10.8H, v4.8H // ............*...................................
+ smlal v12.4S, v14.4H, v0.4H // .........................................*......
+ smlal v23.4S, v10.4H, v4.4H // ...........*....................................
+ str q9, [x0, #16] // ...............................................l
+ smlal2 v20.4S, v13.8H, v15.8H // ...........................*....................
+ ldr q8, [x2], #32 // ....e...........................................
+ smlal v23.4S, v13.4H, v15.4H // ..........................*.....................
+ smlal2 v20.4S, v28.8H, v22.8H // .............................*..................
+ zip1 v26.8H, v19.8H, v27.8H // ............................................l...
+ ldr q9, [x2, #-16] // .....e..........................................
+ smlal v23.4S, v28.4H, v22.4H // ............................*...................
+ uzp2 v27.8H, v12.8H, v11.8H // ...........................................*....
+ uzp1 v17.8H, v8.8H, v9.8H // ......e.........................................
+ uzp2 v4.8H, v8.8H, v9.8H // .......e........................................
+ uzp1 v5.8H, v23.8H, v20.8H // ..................................*.............
+ str q26, [x0], #32 // ..............................................l.
+ mul v31.8H, v5.8H, v2.8H // ...................................*............
+ ldr q19, [x4, #-16] // ..................e.............................
+ ldr q29, [x1], #32 // e...............................................
+ ldr q12, [x1, #-16] // .e..............................................
+ smlal2 v20.4S, v31.8H, v0.8H // .....................................*..........
+ uzp1 v13.8H, v18.8H, v19.8H // ...................e............................
+ uzp1 v3.8H, v29.8H, v12.8H // ..e.............................................
+ uzp2 v10.8H, v29.8H, v12.8H // ...e............................................
+ smull v12.4S, v3.4H, v4.4H // .............e..................................
+ smull2 v11.4S, v3.8H, v4.8H // ..............e.................................
+ ldr q5, [x5, #-16] // ......................e.........................
+ smlal v12.4S, v10.4H, v17.4H // ...............e................................
+ smlal2 v11.4S, v10.8H, v17.8H // ................e...............................
+ uzp2 v14.8H, v30.8H, v5.8H // ........................e.......................
+ uzp1 v15.8H, v30.8H, v5.8H // .......................e........................
+ smlal v12.4S, v13.4H, v14.4H // ..............................e.................
+ smlal2 v11.4S, v13.8H, v14.8H // ...............................e................
+ uzp2 v28.8H, v18.8H, v19.8H // ....................e...........................
+ smlal v23.4S, v31.4H, v0.4H // ....................................*...........
+ smlal v12.4S, v28.4H, v15.4H // ................................e...............
+ smlal2 v11.4S, v28.8H, v15.8H // .................................e..............
+ ld1 {v22.8H}, [x6], #16 // .........................e......................
+ uzp2 v19.8H, v23.8H, v20.8H // ......................................*.........
+ uzp1 v1.8H, v12.8H, v11.8H // .......................................e........
+ smull v23.4S, v3.4H, v17.4H // .........e......................................
+ mul v14.8H, v1.8H, v2.8H // ........................................e.......
+ zip2 v9.8H, v19.8H, v27.8H // .............................................*..
+ ld1 {v4.8H}, [x3], #16 // ........e.......................................
+ smlal2 v11.4S, v14.8H, v0.8H // ..........................................e.....
+
+ // ------------------------------------------------- new position -------------------------------------------------->
+ // 0 25 50 75 100
+ // |------------------------|------------------------|------------------------|------------------------|-------------
+ // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'..................
+ // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'..................
+ // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'..................
+ // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'..................
+ // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~..........
+ // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~......
+ // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~...
+ // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~..
+ // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'..................
+ // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'..................
+ // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~..................
+ // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~.............
+ // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~...............
+ // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'..................
+ // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'..................
+ // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'..................
+ // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'..................
+ // ldr q12, [x4], #32 // e..............................................'~..............................................'~.................
+ // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'..................
+ // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'..................
+ // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'..................
+ // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................
+ // ldr q13, [x5, #-16] // ............................e..................'............................~..................'..................
+ // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'..................
+ // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'..................
+ // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'..................
+ // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~.........
+ // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~...........
+ // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~.....
+ // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........
+ // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'..................
+ // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'..................
+ // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'..................
+ // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'..................
+ // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~.
+ // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'..................
+ // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'..................
+ // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'..................
+ // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'..................
+ // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'..................
+ // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'..................
+ // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~..............
+ // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'..................
+ // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~....
+ // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l.......
+ // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'..................
+ // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l
+ // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............
+
+ sub count, count, #1 // one 32-byte output block is stored per pass (str at [x0, #16], then str with post-increment 32)
+ cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop // repeat until count reaches zero
+ // Instructions: 21
+ // Expected cycles: 35
+ // Expected IPC: 0.60
+
+ // Cycle bound: 35.0
+ // IPC bound: 0.60
+
+ // Wall time: 0.08s
+ // User time: 0.08s
+ // Postamble: finishes the products started by the last loop pass and stores the final two 32-byte output blocks; it loads nothing.
+ // ----- original position ----->
+ // 0 25
+ // |------------------------|----
+ smull2 v5.4S, v3.8H, v17.8H // *.............................
+ smlal v12.4S, v14.4H, v0.4H // ..*...........................
+ smlal v23.4S, v10.4H, v4.4H // ...*..........................
+ str q9, [x0, #16] // ....*.........................
+ smlal2 v5.4S, v10.8H, v4.8H // .*............................
+ uzp2 v11.8H, v12.8H, v11.8H // ..........*...................
+ zip1 v9.8H, v19.8H, v27.8H // ........*.....................
+ smlal v23.4S, v13.4H, v15.4H // ......*.......................
+ smlal2 v5.4S, v13.8H, v15.8H // .....*........................
+ str q9, [x0], #32 // ............*.................
+ smlal v23.4S, v28.4H, v22.4H // .........*....................
+ smlal2 v5.4S, v28.8H, v22.8H // .......*......................
+ uzp1 v9.8H, v23.8H, v5.8H // ...........*..................
+ mul v9.8H, v9.8H, v2.8H // .............*................
+ smlal2 v5.4S, v9.8H, v0.8H // ..............*...............
+ smlal v23.4S, v9.4H, v0.4H // ...............*..............
+ uzp2 v9.8H, v23.8H, v5.8H // ................*.............
+ zip2 v5.8H, v9.8H, v11.8H // .................*............
+ zip1 v9.8H, v9.8H, v11.8H // ...................*..........
+ str q5, [x0, #16] // ..................*...........
+ str q9, [x0], #32 // ....................*.........
+
+ // -------- new position -------->
+ // 0 25
+ // |------------------------|-----
+ // smull2 v20.4S, v3.8H, v17.8H // *..............................
+ // smlal2 v20.4S, v10.8H, v4.8H // ....*..........................
+ // smlal v12.4S, v14.4H, v0.4H // .*.............................
+ // smlal v23.4S, v10.4H, v4.4H // ..*............................
+ // str q9, [x0, #16] // ...*...........................
+ // smlal2 v20.4S, v13.8H, v15.8H // ........*......................
+ // smlal v23.4S, v13.4H, v15.4H // .......*.......................
+ // smlal2 v20.4S, v28.8H, v22.8H // ...........*...................
+ // zip1 v26.8H, v19.8H, v27.8H // ......*........................
+ // smlal v23.4S, v28.4H, v22.4H // ..........*....................
+ // uzp2 v27.8H, v12.8H, v11.8H // .....*.........................
+ // uzp1 v5.8H, v23.8H, v20.8H // ............*..................
+ // str q26, [x0], #32 // .........*.....................
+ // mul v31.8H, v5.8H, v2.8H // .............*.................
+ // smlal2 v20.4S, v31.8H, v0.8H // ..............*................
+ // smlal v23.4S, v31.4H, v0.4H // ...............*...............
+ // uzp2 v19.8H, v23.8H, v20.8H // ................*..............
+ // zip2 v9.8H, v19.8H, v27.8H // .................*.............
+ // str q9, [x0, #16] // ...................*...........
+ // zip1 v26.8H, v19.8H, v27.8H // ..................*............
+ // str q26, [x0], #32 // ....................*..........
+
+
+ pop_stack // reload d8-d15 and release the 64-byte frame taken by push_stack
+ ret // no value is placed in a result register; the output is what was stored through x0
+
+/****************** REGISTER DEALLOCATIONS *******************/
+ .unreq out // drop every alias created with .req for this routine, in declaration order
+ .unreq a0_ptr
+ .unreq b0_ptr
+ .unreq b0_cache_ptr
+ .unreq a1_ptr
+ .unreq b1_ptr
+ .unreq b1_cache_ptr
+ .unreq a2_ptr
+ .unreq b2_ptr
+ .unreq b2_cache_ptr
+ .unreq a3_ptr
+ .unreq b3_ptr
+ .unreq b3_cache_ptr
+ .unreq count
+ .unreq modulus
+ .unreq modulus_twisted
+ .unreq wtmp
+ .unreq aa0
+ .unreq aa1
+ .unreq bb0
+ .unreq bb1
+ .unreq bb1t
+ .unreq res0l
+ .unreq res1l
+ .unreq res0h
+ .unreq res1h
+ .unreq tmp0
+ .unreq tmp1
+ .unreq q_tmp0 // q-register views need their own .unreq, separate from tmp0/tmp1
+ .unreq q_tmp1
+ .unreq out0
+ .unreq out1
+ .unreq t0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S
new file mode 100644
index 000000000..1c30ed6aa
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// AArch64 re-implementation of the asymmetric base multiplication from:
+
+// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+// https://eprint.iacr.org/2021/986
+// https://github.com/neon-ntt/neon-ntt
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 3
+/* simpasm: header-end */
+
+// Input:
+// - Vectors <a>l, <a>h of 32-bit entries (both are clobbered, as is t0)
+// Output:
+// - Montgomery reductions of <a>l || <a>h, as eight 16-bit lanes in x
.macro montgomery_reduce_long x, a
+ uzp1 t0.8h, \a\()l.8h, \a\()h.8h // t0 = low 16 bits of each 32-bit entry
+ mul t0.8h, t0.8h, modulus_twisted.8h // t0 = t0 * modulus_twisted mod 2^16
+ smlal \a\()l.4s, t0.4h, modulus.4h // <a>l += t0[0..3] * modulus (32-bit)
+ smlal2 \a\()h.4s, t0.8h, modulus.8h // <a>h += t0[4..7] * modulus (32-bit)
+ uzp2 \x\().8h, \a\()l.8h, \a\()h.8h // x = high 16 bits of each 32-bit entry
+.endm
+
+// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+
+// Bounds:
+// - Assume |a| < 4096,
+// - Result: < 2*4096*2^15 = 2^28
+.macro pmull d, a, b
+ smull \d\()0l.4s, \a\()0.4h, \b\()0.4h // <d>0l = a0 * b0 (lanes 0-3)
+ smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // <d>0h = a0 * b0 (lanes 4-7)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // <d>0l += a1 * b1t (lanes 0-3)
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // <d>0h += a1 * b1t (lanes 4-7)
+
+ smull \d\()1l.4s, \a\()0.4h, \b\()1.4h // <d>1l = a0 * b1 (lanes 0-3)
+ smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // <d>1h = a0 * b1 (lanes 4-7)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // <d>1l += a1 * b0 (lanes 0-3)
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // <d>1h += a1 * b0 (lanes 4-7)
+.endm
+
+.macro pmlal d, a, b // like pmull, but every product is added onto the existing <d> accumulators
+ smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h // <d>0l += a0 * b0 (lanes 0-3)
+ smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // <d>0h += a0 * b0 (lanes 4-7)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // <d>0l += a1 * b1t (lanes 0-3)
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // <d>0h += a1 * b1t (lanes 4-7)
+
+ smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h // <d>1l += a0 * b1 (lanes 0-3)
+ smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // <d>1h += a0 * b1 (lanes 4-7)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // <d>1l += a1 * b0 (lanes 0-3)
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // <d>1h += a1 * b0 (lanes 4-7)
+.endm
+
+.macro ld2_wrap a, ptr // load 16 halfwords from ptr and split them into <a>0 (even index) and <a>1 (odd index)
+ ldr q_tmp0, [\ptr\()], #32 // first 16 bytes, then ptr advances by 32 bytes
+ ldr q_tmp1, [\ptr\(), #-16] // second 16 bytes, addressed relative to the advanced ptr
+ uzp1 \a\()0.8h, tmp0.8h, tmp1.8h // even-indexed halfwords of tmp0 || tmp1
+ uzp2 \a\()1.8h, tmp0.8h, tmp1.8h // odd-indexed halfwords of tmp0 || tmp1
+.endm
+
+.macro st2_wrap a, ptr // inverse of ld2_wrap: interleave <a>0 and <a>1 and store the 16 halfwords at ptr
+ zip1 tmp0.8h, \a\()0.8h, \a\()1.8h // a0[0], a1[0], ..., a0[3], a1[3]
+ zip2 tmp1.8h, \a\()0.8h, \a\()1.8h // a0[4], a1[4], ..., a0[7], a1[7]
+ str q_tmp0, [\ptr\()], #32 // first 16 bytes, then ptr advances by 32 bytes
+ str q_tmp1, [\ptr\(), #-16] // second 16 bytes, addressed relative to the advanced ptr
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr // fetch the operands consumed by one pmull/pmlal step
+ ld2_wrap \a\(), \a_ptr // fills <a>0 and <a>1, advances a_ptr by 32 bytes
+ ld2_wrap \b\(), \b_ptr // fills <b>0 and <b>1, advances b_ptr by 32 bytes
+ ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 // <b>1t: 8 halfwords from the cache (presumably precomputed twisted b1 values, confirm against caller)
+.endm
+
+.macro save_vregs // spill d8-d15 (low 64 bits of v8-v15), which AAPCS64 requires a callee to preserve
+ sub sp, sp, #(16*4) // reserve 64 bytes of stack, a multiple of 16
+ stp d8, d9, [sp, #16*0]
+ stp d10, d11, [sp, #16*1]
+ stp d12, d13, [sp, #16*2]
+ stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // reload d8-d15 from the slots written by save_vregs
+ ldp d8, d9, [sp, #16*0]
+ ldp d10, d11, [sp, #16*1]
+ ldp d12, d13, [sp, #16*2]
+ ldp d14, d15, [sp, #16*3]
+ add sp, sp, #(16*4) // release the 64 bytes reserved by save_vregs
+.endm
+
+.macro push_stack // function-entry hook: in this file it only saves the vector registers
+ save_vregs
+.endm
+
+.macro pop_stack // function-exit hook: undoes push_stack, must run before ret
+ restore_vregs
+.endm
+
+ out .req x0 // x0-x3 are used by the entry code without being set there, i.e. they are the inputs
+ a0_ptr .req x1
+ b0_ptr .req x2
+ b0_cache_ptr .req x3
+ a1_ptr .req x4 // x4-x9 are derived from x1-x3 in the function prologue
+ b1_ptr .req x5
+ b1_cache_ptr .req x6
+ a2_ptr .req x7
+ b2_ptr .req x8
+ b2_cache_ptr .req x9
+ a3_ptr .req x10 // x10-x12 are not set by the prologue of this k=3 variant
+ b3_ptr .req x11
+ b3_cache_ptr .req x12
+ count .req x13 // loop counter, initialised to MLKEM_N / 16
+ wtmp .req w14 // scratch used to materialise the constants below
+
+ modulus .req v0 // 3329 in every 16-bit lane
+ modulus_twisted .req v2 // 3327 in every 16-bit lane (3329 * 3327 = -1 mod 2^16)
+
+ aa0 .req v3 // operand names as expected by pmull/pmlal/load_polys: <a>0, <a>1
+ aa1 .req v4
+ bb0 .req v5 // <b>0, <b>1 and <b>1t
+ bb1 .req v6
+ bb1t .req v7
+
+ res0l .req v8 // 32-bit accumulators <d>0l/<d>1l/<d>0h/<d>1h of pmull/pmlal
+ res1l .req v9
+ res0h .req v10
+ res1h .req v11
+
+ tmp0 .req v12 // scratch of ld2_wrap/st2_wrap
+ tmp1 .req v13
+ q_tmp0 .req q12 // 128-bit load/store view of tmp0
+ q_tmp1 .req q13 // 128-bit load/store view of tmp1
+
+ out0 .req v26
+ out1 .req v27
+
+ t0 .req v28 // scratch of montgomery_reduce_long
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt):
+ push_stack
+ mov wtmp, #3329
+ dup modulus.8h, wtmp
+
+ mov wtmp, #3327
+ dup modulus_twisted.8h, wtmp
+
+ // Computed bases of vector entries
+
+ add a1_ptr, a0_ptr, #(1 * 512)
+ add b1_ptr, b0_ptr, #(1 * 512)
+ add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
+ add a2_ptr, a0_ptr, #(2 * 512)
+ add b2_ptr, b0_ptr, #(2 * 512)
+ add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
+
+ mov count, #(MLKEM_N / 16)
+ // Instructions: 75
+ // Expected cycles: 103
+ // Expected IPC: 0.73
+
+ // Cycle bound: 103.0
+ // IPC bound: 0.73
+
+ // Wall time: 0.94s
+ // User time: 0.94s
+
+ // --------------------------- original position ---------------------------->
+ // 0 25 50
+ // |------------------------|------------------------|
+ ldr q7, [x2, #16] // *..........................................................................
+ ldr q20, [x2], #32 // ..*........................................................................
+ ldr q15, [x1, #16] // .*.........................................................................
+ uzp1 v8.8H, v20.8H, v7.8H // ...............*...........................................................
+ uzp2 v7.8H, v20.8H, v7.8H // ................*..........................................................
+ ld1 {v20.8H}, [x3], #16 // ...*.......................................................................
+ ldr q30, [x1], #32 // ..............*............................................................
+ ldr q11, [x4], #32 // ....*......................................................................
+ uzp1 v16.8H, v30.8H, v15.8H // .................*.........................................................
+ uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................
+ smull v30.4S, v16.4H, v7.4H // ...................*.......................................................
+ smull2 v7.4S, v16.8H, v7.8H // ....................*......................................................
+ smull v9.4S, v16.4H, v8.4H // .....................*.....................................................
+ smull2 v16.4S, v16.8H, v8.8H // ......................*....................................................
+ smlal v30.4S, v15.4H, v8.4H // .......................*...................................................
+ smlal2 v7.4S, v15.8H, v8.8H // ........................*..................................................
+ smlal v9.4S, v15.4H, v20.4H // .........................*.................................................
+ smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................
+ ldr q20, [x4, #-16] // .....*.....................................................................
+ ldr q15, [x5], #32 // ......*....................................................................
+ uzp1 v8.8H, v11.8H, v20.8H // ...........................*...............................................
+ uzp2 v20.8H, v11.8H, v20.8H // ............................*..............................................
+ ldr q11, [x5, #-16] // .......*...................................................................
+ ld1 {v27.8H}, [x6], #16 // ........*..................................................................
+ uzp1 v10.8H, v15.8H, v11.8H // .............................*.............................................
+ uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................
+ smlal v9.4S, v8.4H, v10.4H // ...............................*...........................................
+ smlal2 v16.4S, v8.8H, v10.8H // ................................*..........................................
+ smlal v30.4S, v8.4H, v15.4H // .................................*.........................................
+ smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................
+ smlal v9.4S, v20.4H, v27.4H // ...................................*.......................................
+ smlal2 v16.4S, v20.8H, v27.8H // ....................................*......................................
+ smlal v30.4S, v20.4H, v10.4H // .....................................*.....................................
+ smlal2 v7.4S, v20.8H, v10.8H // ......................................*....................................
+ ldr q20, [x7], #32 // .........*.................................................................
+ ldr q15, [x7, #-16] // ..........*................................................................
+ ldr q8, [x8], #32 // ...........*...............................................................
+ uzp1 v11.8H, v20.8H, v15.8H // .......................................*...................................
+ uzp2 v20.8H, v20.8H, v15.8H // ........................................*..................................
+ ldr q15, [x8, #-16] // ............*..............................................................
+ ld1 {v27.8H}, [x9], #16 // .............*.............................................................
+ uzp1 v10.8H, v8.8H, v15.8H // .........................................*.................................
+ uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................
+ smlal v9.4S, v11.4H, v10.4H // ...........................................*...............................
+ smlal2 v16.4S, v11.8H, v10.8H // ............................................*..............................
+ smlal v30.4S, v11.4H, v15.4H // .............................................*.............................
+ smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................
+ smlal v9.4S, v20.4H, v27.4H // ...............................................*...........................
+ smlal2 v16.4S, v20.8H, v27.8H // ................................................*..........................
+ smlal v30.4S, v20.4H, v10.4H // .................................................*.........................
+ smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................
+ ldr q15, [x2], #32 // ...............................................................*...........
+ uzp1 v20.8H, v9.8H, v16.8H // ....................................................*......................
+ uzp1 v8.8H, v30.8H, v7.8H // .....................................................*.....................
+ mul v20.8H, v20.8H, v2.8H // ......................................................*....................
+ mul v8.8H, v8.8H, v2.8H // .......................................................*...................
+ ldr q21, [x4], #32 // .................................................................*.........
+ smlal v9.4S, v20.4H, v0.4H // ........................................................*..................
+ smlal2 v16.4S, v20.8H, v0.8H // .........................................................*.................
+ smlal v30.4S, v8.4H, v0.4H // ..........................................................*................
+ smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*...............
+ ldr q6, [x4, #-16] // ..................................................................*........
+ uzp2 v27.8H, v9.8H, v16.8H // ............................................................*..............
+ uzp2 v10.8H, v30.8H, v7.8H // .............................................................*.............
+ ldr q16, [x2, #-16] // ...................................................*.......................
+ ldr q30, [x1, #16] // ..............................................................*............
+ ld1 {v9.8H}, [x3], #16 // ................................................................*..........
+ ldr q1, [x5], #32 // ...................................................................*.......
+ ldr q12, [x5, #-16] // ....................................................................*......
+ ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
+ ldr q19, [x7], #32 // ......................................................................*....
+ ldr q31, [x7, #-16] // .......................................................................*...
+ ldr q17, [x8], #32 // ........................................................................*..
+ ldr q18, [x8, #-16] // .........................................................................*.
+ ld1 {v25.8H}, [x9], #16 // ..........................................................................*
+
+ // ------------------------------ new position ------------------------------>
+ // 0 25 50
+ // |------------------------|------------------------|------------------------
+ // ldr q16, [x2, #16] // *..........................................................................
+ // ldr q30, [x1, #16] // ..*........................................................................
+ // ldr q15, [x2], #32 // .*.........................................................................
+ // ld1 {v9.8H}, [x3], #16 // .....*.....................................................................
+ // ldr q21, [x4], #32 // .......*...................................................................
+ // ldr q6, [x4, #-16] // ..................*........................................................
+ // ldr q1, [x5], #32 // ...................*.......................................................
+ // ldr q12, [x5, #-16] // ......................*....................................................
+ // ld1 {v24.8H}, [x6], #16 // .......................*...................................................
+ // ldr q19, [x7], #32 // ..................................*........................................
+ // ldr q31, [x7, #-16] // ...................................*.......................................
+ // ldr q17, [x8], #32 // ....................................*......................................
+ // ldr q18, [x8, #-16] // .......................................*...................................
+ // ld1 {v25.8H}, [x9], #16 // ........................................*..................................
+ // ldr q20, [x1], #32 // ......*....................................................................
+ // uzp1 v7.8H, v15.8H, v16.8H // ...*.......................................................................
+ // uzp2 v15.8H, v15.8H, v16.8H // ....*......................................................................
+ // uzp1 v8.8H, v20.8H, v30.8H // ........*..................................................................
+ // uzp2 v20.8H, v20.8H, v30.8H // .........*.................................................................
+ // smull v30.4S, v8.4H, v15.4H // ..........*................................................................
+ // smull2 v15.4S, v8.8H, v15.8H // ...........*...............................................................
+ // smull v11.4S, v8.4H, v7.4H // ............*..............................................................
+ // smull2 v8.4S, v8.8H, v7.8H // .............*.............................................................
+ // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................
+ // smlal2 v15.4S, v20.8H, v7.8H // ...............*...........................................................
+ // smlal v11.4S, v20.4H, v9.4H // ................*..........................................................
+ // smlal2 v8.4S, v20.8H, v9.8H // .................*.........................................................
+ // uzp1 v7.8H, v21.8H, v6.8H // ....................*......................................................
+ // uzp2 v20.8H, v21.8H, v6.8H // .....................*.....................................................
+ // uzp1 v16.8H, v1.8H, v12.8H // ........................*..................................................
+ // uzp2 v9.8H, v1.8H, v12.8H // .........................*.................................................
+ // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................
+ // smlal2 v8.4S, v7.8H, v16.8H // ...........................*...............................................
+ // smlal v30.4S, v7.4H, v9.4H // ............................*..............................................
+ // smlal2 v15.4S, v7.8H, v9.8H // .............................*.............................................
+ // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................
+ // smlal2 v8.4S, v20.8H, v24.8H // ...............................*...........................................
+ // smlal v30.4S, v20.4H, v16.4H // ................................*..........................................
+ // smlal2 v15.4S, v20.8H, v16.8H // .................................*.........................................
+ // uzp1 v7.8H, v19.8H, v31.8H // .....................................*.....................................
+ // uzp2 v20.8H, v19.8H, v31.8H // ......................................*....................................
+ // uzp1 v16.8H, v17.8H, v18.8H // .........................................*.................................
+ // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................
+ // smlal v11.4S, v7.4H, v16.4H // ...........................................*...............................
+ // smlal2 v8.4S, v7.8H, v16.8H // ............................................*..............................
+ // smlal v30.4S, v7.4H, v9.4H // .............................................*.............................
+ // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................
+ // smlal v11.4S, v20.4H, v25.4H // ...............................................*...........................
+ // smlal2 v8.4S, v20.8H, v25.8H // ................................................*..........................
+ // smlal v30.4S, v20.4H, v16.4H // .................................................*.........................
+ // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................
+ // ldr q16, [x2, #16] // ................................................................*..........
+ // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*......................
+ // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*.....................
+ // mul v7.8H, v7.8H, v2.8H // ......................................................*....................
+ // mul v20.8H, v20.8H, v2.8H // .......................................................*...................
+ // smlal v11.4S, v7.4H, v0.4H // .........................................................*.................
+ // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................
+ // smlal v30.4S, v20.4H, v0.4H // ...........................................................*...............
+ // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*..............
+ // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............
+ // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*...........
+ // ldr q30, [x1, #16] // .................................................................*.........
+ // ldr q15, [x2], #32 // ...................................................*.......................
+ // ld1 {v9.8H}, [x3], #16 // ..................................................................*........
+ // ldr q21, [x4], #32 // ........................................................*..................
+ // ldr q6, [x4, #-16] // .............................................................*.............
+ // ldr q1, [x5], #32 // ...................................................................*.......
+ // ldr q12, [x5, #-16] // ....................................................................*......
+ // ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
+ // ldr q19, [x7], #32 // ......................................................................*....
+ // ldr q31, [x7, #-16] // .......................................................................*...
+ // ldr q17, [x8], #32 // ........................................................................*..
+ // ldr q18, [x8, #-16] // .........................................................................*.
+ // ld1 {v25.8H}, [x9], #16 // ..........................................................................*
+
+ sub count, count, #2
+polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop:
+ // Instructions: 65
+ // Expected cycles: 80
+ // Expected IPC: 0.81
+
+ // Cycle bound: 80.0
+ // IPC bound: 0.81
+
+ // Wall time: 11.64s
+ // User time: 11.64s
+
+ // ---------------------- original position ----------------------->
+ // 0 25 50
+ // |------------------------|------------------------|--------------
+ ldr q20, [x1], #32 // *................................................................
+ uzp1 v7.8H, v15.8H, v16.8H // ......*..........................................................
+ uzp2 v15.8H, v15.8H, v16.8H // .......*.........................................................
+ uzp1 v8.8H, v20.8H, v30.8H // ..*..............................................................
+ uzp2 v20.8H, v20.8H, v30.8H // ...*.............................................................
+ smull v30.4S, v8.4H, v15.4H // .............*...................................................
+ smull2 v15.4S, v8.8H, v15.8H // ..............*..................................................
+ smull v11.4S, v8.4H, v7.4H // .........*.......................................................
+ smull2 v8.4S, v8.8H, v7.8H // ..........*......................................................
+ smlal v30.4S, v20.4H, v7.4H // ...............*.................................................
+ smlal2 v15.4S, v20.8H, v7.8H // ................*................................................
+ smlal v11.4S, v20.4H, v9.4H // ...........*.....................................................
+ smlal2 v8.4S, v20.8H, v9.8H // ............*....................................................
+ uzp1 v7.8H, v21.8H, v6.8H // ...................*.............................................
+ uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................
+ uzp1 v16.8H, v1.8H, v12.8H // .......................*.........................................
+ uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................
+ smlal v11.4S, v7.4H, v16.4H // ..........................*......................................
+ smlal2 v8.4S, v7.8H, v16.8H // ...........................*.....................................
+ smlal v30.4S, v7.4H, v9.4H // ..............................*..................................
+ smlal2 v15.4S, v7.8H, v9.8H // ...............................*.................................
+ smlal v11.4S, v20.4H, v24.4H // ............................*....................................
+ smlal2 v8.4S, v20.8H, v24.8H // .............................*...................................
+ smlal v30.4S, v20.4H, v16.4H // ................................*................................
+ smlal2 v15.4S, v20.8H, v16.8H // .................................*...............................
+ uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................
+ uzp2 v20.8H, v19.8H, v31.8H // .....................................*...........................
+ uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................
+ uzp2 v9.8H, v17.8H, v18.8H // .........................................*.......................
+ smlal v11.4S, v7.4H, v16.4H // ...........................................*.....................
+ smlal2 v8.4S, v7.8H, v16.8H // ............................................*....................
+ smlal v30.4S, v7.4H, v9.4H // ...............................................*.................
+ smlal2 v15.4S, v7.8H, v9.8H // ................................................*................
+ smlal v11.4S, v20.4H, v25.4H // .............................................*...................
+ smlal2 v8.4S, v20.8H, v25.8H // ..............................................*..................
+ smlal v30.4S, v20.4H, v16.4H // .................................................*...............
+ smlal2 v15.4S, v20.8H, v16.8H // ..................................................*..............
+ ldr q16, [x2, #16] // .....e...........................................................
+ uzp1 v7.8H, v11.8H, v8.8H // ...................................................*.............
+ uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........
+ mul v7.8H, v7.8H, v2.8H // ....................................................*............
+ mul v20.8H, v20.8H, v2.8H // .........................................................*.......
+ zip2 v9.8H, v27.8H, v10.8H // ..............................................................l..
+ zip1 v27.8H, v27.8H, v10.8H // .............................................................l...
+ smlal v11.4S, v7.4H, v0.4H // .....................................................*...........
+ smlal2 v8.4S, v7.8H, v0.8H // ......................................................*..........
+ smlal v30.4S, v20.4H, v0.4H // ..........................................................*......
+ smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*.....
+ str q27, [x0], #32 // ...............................................................l.
+ uzp2 v27.8H, v11.8H, v8.8H // .......................................................*.........
+ str q9, [x0, #-16] // ................................................................l
+ uzp2 v10.8H, v30.8H, v15.8H // ............................................................*....
+ ldr q30, [x1, #16] // .e...............................................................
+ ldr q15, [x2], #32 // ....e............................................................
+ ld1 {v9.8H}, [x3], #16 // ........e........................................................
+ ldr q21, [x4], #32 // .................e...............................................
+ ldr q6, [x4, #-16] // ..................e..............................................
+ ldr q1, [x5], #32 // .....................e...........................................
+ ldr q12, [x5, #-16] // ......................e..........................................
+ ld1 {v24.8H}, [x6], #16 // .........................e.......................................
+ ldr q19, [x7], #32 // ..................................e..............................
+ ldr q31, [x7, #-16] // ...................................e.............................
+ ldr q17, [x8], #32 // ......................................e..........................
+ ldr q18, [x8, #-16] // .......................................e.........................
+ ld1 {v25.8H}, [x9], #16 // ..........................................e......................
+
+ // ---------------------------------------------------------------- new position ----------------------------------------------------------------->
+ // 0 25 50 75 100 125
+ // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------
+ // ldr q12, [x1], #32 // ............................*................................................................~..................................................
+ // ldr q13, [x1, #-16] // ...............e............'...................................................~............'..................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~...............................................
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~..............................................
+ // ldr q12, [x2], #32 // ................e...........'....................................................~...........'..................................................
+ // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~.............
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~.................................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................
+ // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'..................................................
+ // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~...........................................
+ // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~..........................................
+ // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~.......................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~......................................
+ // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~.............................................
+ // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................
+ // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~.........................................
+ // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................
+ // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'..................................................
+ // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'..................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~.....................................
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~....................................
+ // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'..................................................
+ // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'..................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~...................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~..................................
+ // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'..................................................
+ // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~.................................
+ // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................
+ // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~.............................
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................
+ // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~...............................
+ // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~..............................
+ // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~...........................
+ // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~..........................
+ // ldr q12, [x7], #32 // .......................e....'...........................................................~....'..................................................
+ // ldr q13, [x7, #-16] // ........................e...'............................................................~...'..................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~.........................
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................
+ // ldr q12, [x8], #32 // .........................e..'.............................................................~..'..................................................
+ // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'..................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~.......................
+ // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~......................
+ // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'..................................................
+ // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~.....................
+ // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~....................
+ // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~.................
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................
+ // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~...................
+ // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~..................
+ // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~...............
+ // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~..............
+ // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............
+ // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~..........
+ // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~......
+ // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~.....
+ // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~.
+ // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~...........
+ // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~.........
+ // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~....
+ // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~...
+ // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'..................................................
+ // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l.......
+ // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........
+ // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l..
+ // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l
+
+ sub count, count, #1
+ cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop
+ // Instructions: 55
+ // Expected cycles: 61
+ // Expected IPC: 0.90
+
+ // Cycle bound: 61.0
+ // IPC bound: 0.90
+
+ // Wall time: 8.41s
+ // User time: 8.41s
+
+ // ----------------- original position ------------------>
+ // 0 25 50
+ // |------------------------|------------------------|----
+ ldr q7, [x1], #32 // *......................................................
+ uzp1 v20.8H, v15.8H, v16.8H // .*.....................................................
+ uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
+ uzp1 v23.8H, v7.8H, v30.8H // ...*...................................................
+ uzp2 v11.8H, v7.8H, v30.8H // ....*..................................................
+ smull2 v8.4S, v23.8H, v20.8H // ........*..............................................
+ smull v5.4S, v23.4H, v20.4H // .......*...............................................
+ smull2 v30.4S, v23.8H, v15.8H // ......*................................................
+ uzp1 v28.8H, v1.8H, v12.8H // ...............*.......................................
+ smlal2 v8.4S, v11.8H, v9.8H // ............*..........................................
+ smlal v5.4S, v11.4H, v9.4H // ...........*...........................................
+ uzp1 v3.8H, v21.8H, v6.8H // .............*.........................................
+ smull v16.4S, v23.4H, v15.4H // .....*.................................................
+ smlal2 v8.4S, v3.8H, v28.8H // ..................*....................................
+ smlal v5.4S, v3.4H, v28.4H // .................*.....................................
+ uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................
+ uzp1 v7.8H, v17.8H, v18.8H // ...........................*...........................
+ smlal2 v8.4S, v29.8H, v24.8H // ......................*................................
+ uzp1 v14.8H, v19.8H, v31.8H // .........................*.............................
+ smlal v16.4S, v11.4H, v20.4H // .........*.............................................
+ smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................
+ smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................
+ uzp2 v20.8H, v1.8H, v12.8H // ................*......................................
+ uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................
+ smlal2 v30.4S, v3.8H, v20.8H // ....................*..................................
+ smlal v16.4S, v3.4H, v20.4H // ...................*...................................
+ smlal v5.4S, v29.4H, v24.4H // .....................*.................................
+ uzp2 v9.8H, v17.8H, v18.8H // ............................*..........................
+ smlal2 v30.4S, v29.8H, v28.8H // ........................*..............................
+ smlal v16.4S, v29.4H, v28.4H // .......................*...............................
+ smlal v5.4S, v14.4H, v7.4H // .............................*.........................
+ smlal2 v8.4S, v21.8H, v25.8H // ..................................*....................
+ smlal2 v30.4S, v14.8H, v9.8H // ................................*......................
+ smlal v16.4S, v14.4H, v9.4H // ...............................*.......................
+ smlal v5.4S, v21.4H, v25.4H // .................................*.....................
+ zip1 v20.8H, v27.8H, v10.8H // ..........................................*............
+ smlal2 v30.4S, v21.8H, v7.8H // ....................................*..................
+ smlal v16.4S, v21.4H, v7.4H // ...................................*...................
+ uzp1 v7.8H, v5.8H, v8.8H // .....................................*.................
+ str q20, [x0], #32 // ...............................................*.......
+ mul v15.8H, v7.8H, v2.8H // .......................................*...............
+ uzp1 v7.8H, v16.8H, v30.8H // ......................................*................
+ zip2 v31.8H, v27.8H, v10.8H // .........................................*.............
+ mul v20.8H, v7.8H, v2.8H // ........................................*..............
+ smlal v5.4S, v15.4H, v0.4H // ...........................................*...........
+ smlal2 v8.4S, v15.8H, v0.8H // ............................................*..........
+ str q31, [x0, #-16] // .................................................*.....
+ smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........
+ smlal v16.4S, v20.4H, v0.4H // .............................................*.........
+ uzp2 v15.8H, v5.8H, v8.8H // ................................................*......
+ uzp2 v20.8H, v16.8H, v30.8H // ..................................................*....
+ zip1 v7.8H, v15.8H, v20.8H // ....................................................*..
+ zip2 v20.8H, v15.8H, v20.8H // ...................................................*...
+ str q7, [x0], #32 // .....................................................*.
+ str q20, [x0, #-16] // ......................................................*
+
+ // -------------------- new position -------------------->
+ // 0 25 50
+ // |------------------------|------------------------|----
+ // ldr q20, [x1], #32 // *......................................................
+ // uzp1 v7.8H, v15.8H, v16.8H // .*.....................................................
+ // uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
+ // uzp1 v8.8H, v20.8H, v30.8H // ...*...................................................
+ // uzp2 v20.8H, v20.8H, v30.8H // ....*..................................................
+ // smull v30.4S, v8.4H, v15.4H // ............*..........................................
+ // smull2 v15.4S, v8.8H, v15.8H // .......*...............................................
+ // smull v11.4S, v8.4H, v7.4H // ......*................................................
+ // smull2 v8.4S, v8.8H, v7.8H // .....*.................................................
+ // smlal v30.4S, v20.4H, v7.4H // ...................*...................................
+ // smlal2 v15.4S, v20.8H, v7.8H // ....................*..................................
+ // smlal v11.4S, v20.4H, v9.4H // ..........*............................................
+ // smlal2 v8.4S, v20.8H, v9.8H // .........*.............................................
+ // uzp1 v7.8H, v21.8H, v6.8H // ...........*...........................................
+ // uzp2 v20.8H, v21.8H, v6.8H // ...............*.......................................
+ // uzp1 v16.8H, v1.8H, v12.8H // ........*..............................................
+ // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................
+ // smlal v11.4S, v7.4H, v16.4H // ..............*........................................
+ // smlal2 v8.4S, v7.8H, v16.8H // .............*.........................................
+ // smlal v30.4S, v7.4H, v9.4H // .........................*.............................
+ // smlal2 v15.4S, v7.8H, v9.8H // ........................*..............................
+ // smlal v11.4S, v20.4H, v24.4H // ..........................*............................
+ // smlal2 v8.4S, v20.8H, v24.8H // .................*.....................................
+ // smlal v30.4S, v20.4H, v16.4H // .............................*.........................
+ // smlal2 v15.4S, v20.8H, v16.8H // ............................*..........................
+ // uzp1 v7.8H, v19.8H, v31.8H // ..................*....................................
+ // uzp2 v20.8H, v19.8H, v31.8H // .......................*...............................
+ // uzp1 v16.8H, v17.8H, v18.8H // ................*......................................
+ // uzp2 v9.8H, v17.8H, v18.8H // ...........................*...........................
+ // smlal v11.4S, v7.4H, v16.4H // ..............................*........................
+ // smlal2 v8.4S, v7.8H, v16.8H // .....................*.................................
+ // smlal v30.4S, v7.4H, v9.4H // .................................*.....................
+ // smlal2 v15.4S, v7.8H, v9.8H // ................................*......................
+ // smlal v11.4S, v20.4H, v25.4H // ..................................*....................
+ // smlal2 v8.4S, v20.8H, v25.8H // ...............................*.......................
+ // smlal v30.4S, v20.4H, v16.4H // .....................................*.................
+ // smlal2 v15.4S, v20.8H, v16.8H // ....................................*..................
+ // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................
+ // uzp1 v20.8H, v30.8H, v15.8H // .........................................*.............
+ // mul v7.8H, v7.8H, v2.8H // ........................................*..............
+ // mul v20.8H, v20.8H, v2.8H // ...........................................*...........
+ // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............
+ // zip1 v27.8H, v27.8H, v10.8H // ...................................*...................
+ // smlal v11.4S, v7.4H, v0.4H // ............................................*..........
+ // smlal2 v8.4S, v7.8H, v0.8H // .............................................*.........
+ // smlal v30.4S, v20.4H, v0.4H // ................................................*......
+ // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*.......
+ // str q27, [x0], #32 // .......................................*...............
+ // uzp2 v27.8H, v11.8H, v8.8H // .................................................*.....
+ // str q9, [x0, #-16] // ..............................................*........
+ // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*....
+ // zip2 v9.8H, v27.8H, v10.8H // ....................................................*..
+ // zip1 v27.8H, v27.8H, v10.8H // ...................................................*...
+ // str q27, [x0], #32 // .....................................................*.
+ // str q9, [x0, #-16] // ......................................................*
+
+
+ pop_stack
+ ret
+
+/****************** REGISTER DEALLOCATIONS *******************/
+ .unreq out // .unreq removes a register alias; each name here should have a matching .req earlier in this file
+ .unreq a0_ptr
+ .unreq b0_ptr
+ .unreq b0_cache_ptr
+ .unreq a1_ptr
+ .unreq b1_ptr
+ .unreq b1_cache_ptr
+ .unreq a2_ptr
+ .unreq b2_ptr
+ .unreq b2_cache_ptr
+ .unreq a3_ptr
+ .unreq b3_ptr
+ .unreq b3_cache_ptr
+ .unreq count
+ .unreq modulus
+ .unreq modulus_twisted
+ .unreq wtmp
+ .unreq aa0
+ .unreq aa1
+ .unreq bb0
+ .unreq bb1
+ .unreq bb1t
+ .unreq res0l
+ .unreq res1l
+ .unreq res0h
+ .unreq res1h
+ .unreq tmp0
+ .unreq tmp1
+ .unreq q_tmp0
+ .unreq q_tmp1
+ .unreq out0
+ .unreq out1
+ .unreq t0 // last alias released; only the simpasm footer and the closing #endif follow
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S
new file mode 100644
index 000000000..c3d70ed42
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// AArch64 re-implementation of the asymmetric base multiplication from:
+
+// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+// https://eprint.iacr.org/2021/986
+// https://github.com/neon-ntt/neon-ntt
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 4
+/* simpasm: header-end */
+
+// Input:
+// - Vectors al, ah of 32-bit entries (macro argument a is their shared name prefix)
+// Output:
+// - Montgomery reductions of al || ah, stored in x; al, ah and t0 are clobbered
+.macro montgomery_reduce_long x, a
+ uzp1 t0.8h, \a\()l.8h, \a\()h.8h // t0 = low 16 bits of every 32-bit lane of al || ah
+ mul t0.8h, t0.8h, modulus_twisted.8h // t0 = t0 * modulus_twisted, truncated to 16 bits
+ smlal \a\()l.4s, t0.4h, modulus.4h // al += t0[0..3] * modulus, widened to 32 bits
+ smlal2 \a\()h.4s, t0.8h, modulus.8h // ah += t0[4..7] * modulus, widened to 32 bits
+ uzp2 \x\().8h, \a\()l.8h, \a\()h.8h // x = high 16 bits of every 32-bit lane of al || ah
+.endm
+
+// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+
+// Bounds:
+// - Assume |a| < 4096,
+// - Result: < 2*4096*2^15 = 2^28
+.macro pmull d, a, b
+ smull \d\()0l.4s, \a\()0.4h, \b\()0.4h // d0l = a0 * b0 (elements 0..3); smull overwrites d0l
+ smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h = a0 * b0 (elements 4..7)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t
+
+ smull \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l = a0 * b1 (elements 0..3); smull overwrites d1l
+ smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h = a0 * b1 (elements 4..7)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0
+.endm
+
+.macro pmlal d, a, b
+ smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h // accumulating form of pmull: d0l += a0 * b0 (elements 0..3)
+ smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h // d0h += a0 * b0 (elements 4..7)
+ smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h // d0l += a1 * b1t
+ smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h // d0h += a1 * b1t
+
+ smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h // d1l += a0 * b1 (elements 0..3)
+ smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h // d1h += a0 * b1 (elements 4..7)
+ smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h // d1l += a1 * b0
+ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h // d1h += a1 * b0
+.endm
+
+.macro ld2_wrap a, ptr
+ ldr q_tmp0, [\ptr\()], #32 // tmp0 = 16 bytes at ptr, then ptr += 32 (post-index)
+ ldr q_tmp1, [\ptr\(), #-16] // tmp1 = the next 16 bytes, addressed back from the advanced ptr
+ uzp1 \a\()0.8h, tmp0.8h, tmp1.8h // a0 = even-indexed 16-bit elements of tmp0 || tmp1
+ uzp2 \a\()1.8h, tmp0.8h, tmp1.8h // a1 = odd-indexed 16-bit elements of tmp0 || tmp1
+.endm
+
+.macro st2_wrap a, ptr
+ zip1 tmp0.8h, \a\()0.8h, \a\()1.8h // tmp0 = elements 0..3 of a0 and a1, interleaved
+ zip2 tmp1.8h, \a\()0.8h, \a\()1.8h // tmp1 = elements 4..7 of a0 and a1, interleaved
+ str q_tmp0, [\ptr\()], #32 // store tmp0 at ptr, then ptr += 32 (post-index)
+ str q_tmp1, [\ptr\(), #-16] // store tmp1 in the 16 bytes that follow tmp0
+.endm
+
+.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
+ ld2_wrap \a\(), \a_ptr // a0/a1 = de-interleaved 32 bytes from a_ptr; a_ptr += 32; clobbers tmp0/tmp1
+ ld2_wrap \b\(), \b_ptr // b0/b1 = de-interleaved 32 bytes from b_ptr; b_ptr += 32; clobbers tmp0/tmp1
+ ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 // b1t = eight 16-bit values from b_cache_ptr; b_cache_ptr += 16
+.endm
+
+.macro save_vregs
+ sub sp, sp, #(16*4) // reserve 64 bytes of stack for the eight d registers below
+ stp d8, d9, [sp, #16*0] // spill d8-d15: the low 64 bits of v8-v15, which AAPCS64 makes callee-saved
+ stp d10, d11, [sp, #16*1]
+ stp d12, d13, [sp, #16*2]
+ stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+ ldp d8, d9, [sp, #16*0] // reload d8-d15 from the same offsets save_vregs stored them at
+ ldp d10, d11, [sp, #16*1]
+ ldp d12, d13, [sp, #16*2]
+ ldp d14, d15, [sp, #16*3]
+ add sp, sp, #(16*4) // release the 64 bytes reserved by save_vregs
+.endm
+
+.macro push_stack
+ save_vregs // the whole frame is the d8-d15 spill area; nothing else is pushed
+.endm
+
+.macro pop_stack
+ restore_vregs // exact inverse of push_stack: reload d8-d15 and release the 64-byte frame
+.endm
+
+ out .req x0 // NOTE(review): x0-x3 are never initialised in this file, so presumably the four call arguments - confirm against the C prototype
+ a0_ptr .req x1
+ b0_ptr .req x2
+ b0_cache_ptr .req x3
+ a1_ptr .req x4 // a0_ptr + 1*512, computed in the function prologue
+ b1_ptr .req x5 // b0_ptr + 1*512
+ b1_cache_ptr .req x6 // b0_cache_ptr + 1*512/2
+ a2_ptr .req x7 // a0_ptr + 2*512
+ b2_ptr .req x8 // b0_ptr + 2*512
+ b2_cache_ptr .req x9 // b0_cache_ptr + 2*512/2
+ a3_ptr .req x10 // a0_ptr + 3*512
+ b3_ptr .req x11 // b0_ptr + 3*512
+ b3_cache_ptr .req x12 // b0_cache_ptr + 3*512/2
+ count .req x13 // loop counter, initialised to MLKEM_N / 16
+ wtmp .req w14 // scratch used to materialise the constants 3329 and 3327
+
+ modulus .req v0 // every 16-bit lane holds 3329
+ modulus_twisted .req v2 // every 16-bit lane holds 3327
+
+ aa0 .req v3 // aa0/aa1, bb0/bb1 and bb1t match the a0/a1, b0/b1, b1t operand names built by pmull/pmlal/load_polys
+ aa1 .req v4
+ bb0 .req v5
+ bb1 .req v6
+ bb1t .req v7
+
+ res0l .req v8 // res0l/res0h/res1l/res1h match the d0l/d0h/d1l/d1h operand names built by pmull/pmlal
+ res1l .req v9
+ res0h .req v10
+ res1h .req v11
+
+ tmp0 .req v12 // scratch for ld2_wrap/st2_wrap
+ tmp1 .req v13
+ q_tmp0 .req q12 // same register as tmp0, named as a 128-bit q register for ldr/str
+ q_tmp1 .req q13 // same register as tmp1, named as a 128-bit q register for ldr/str
+
+ out0 .req v26
+ out1 .req v27
+
+ t0 .req v28 // scratch used inside montgomery_reduce_long
+
+ .text
+ .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt)
+ .balign 4
+MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt):
+ push_stack
+ mov wtmp, #3329
+ dup modulus.8h, wtmp
+
+ mov wtmp, #3327
+ dup modulus_twisted.8h, wtmp
+
+ // Compute the base pointers of vector entries 1..3 from those of entry 0
+
+ add a1_ptr, a0_ptr, #(1 * 512)
+ add b1_ptr, b0_ptr, #(1 * 512)
+ add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
+ add a2_ptr, a0_ptr, #(2 * 512)
+ add b2_ptr, b0_ptr, #(2 * 512)
+ add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
+ add a3_ptr, a0_ptr, #(3 * 512)
+ add b3_ptr, b0_ptr, #(3 * 512)
+ add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2)
+
+ // Bounds:
+
+ // Each pmull/pmlal term is bounded by 2*4096*2^15 = 2^28, so the sum of the
+ // four terms before Montgomery reduction is bounded by 4*2^28 = 2^30.
+
+ mov count, #(MLKEM_N / 16)
+ // Instructions: 114
+ // Expected cycles: 153
+ // Expected IPC: 0.75
+ //
+ // Cycle bound: 153.0
+ // IPC bound: 0.75
+ //
+ // Wall time: 0.69s
+ // User time: 0.69s
+ //
+ // ----------------------------------------------- original position ----------------------------------------------->
+ // 0 25 50 75 100
+ // |------------------------|------------------------|------------------------|------------------------|-------------
+ ldr q23, [x2, #16] // .*................................................................................................................
+ ldr q19, [x2], #32 // *.................................................................................................................
+ ldr q17, [x5], #32 // ..*...............................................................................................................
+ uzp2 v13.8H, v19.8H, v23.8H // ..........*.......................................................................................................
+ uzp1 v19.8H, v19.8H, v23.8H // ...........*......................................................................................................
+ ldr q23, [x5, #-16] // ...*..............................................................................................................
+ ldr q30, [x1, #16] // .....*............................................................................................................
+ uzp2 v9.8H, v17.8H, v23.8H // ....*.............................................................................................................
+ uzp1 v23.8H, v17.8H, v23.8H // .......*..........................................................................................................
+ ldr q17, [x1], #32 // ......*...........................................................................................................
+ ldr q10, [x7, #16] // .............*....................................................................................................
+ uzp1 v12.8H, v17.8H, v30.8H // ........*.........................................................................................................
+ uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................
+ smull2 v30.4S, v12.8H, v13.8H // ............*.....................................................................................................
+ smull v13.4S, v12.4H, v13.4H // ............................................*.....................................................................
+ smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................
+ smull v12.4S, v12.4H, v19.4H // ..........................................*.......................................................................
+ smlal2 v30.4S, v17.8H, v19.8H // ...............................*..................................................................................
+ smlal v13.4S, v17.4H, v19.4H // ...............................................*..................................................................
+ ldr q19, [x4], #32 // ....................*.............................................................................................
+ ldr q16, [x4, #-16] // .....................*............................................................................................
+ ld1 {v8.8H}, [x3], #16 // ................................*.................................................................................
+ uzp1 v26.8H, v19.8H, v16.8H // .......................*..........................................................................................
+ uzp2 v19.8H, v19.8H, v16.8H // ........................*.........................................................................................
+ smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................
+ smlal v13.4S, v26.4H, v9.4H // ..................................................*...............................................................
+ smlal2 v22.4S, v17.8H, v8.8H // ........................................*.........................................................................
+ smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................
+ smlal2 v30.4S, v19.8H, v23.8H // ...................................*..............................................................................
+ smlal v13.4S, v19.4H, v23.4H // .......................................................*..........................................................
+ smlal2 v22.4S, v26.8H, v23.8H // ...........................................*......................................................................
+ smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................
+ ldr q23, [x7], #32 // ......................*...........................................................................................
+ ldr q17, [x8, #16] // ..............*...................................................................................................
+ uzp1 v9.8H, v23.8H, v10.8H // ..........................*.......................................................................................
+ uzp2 v23.8H, v23.8H, v10.8H // ....................................*.............................................................................
+ ldr q10, [x10], #32 // ...............*..................................................................................................
+ ldr q16, [x10, #-16] // ................*.................................................................................................
+ ld1 {v8.8H}, [x12], #16 // .................*................................................................................................
+ uzp1 v26.8H, v10.8H, v16.8H // ..................*...............................................................................................
+ uzp2 v10.8H, v10.8H, v16.8H // ...................*..............................................................................................
+ ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................
+ ldr q3, [x11, #16] // ...........................*......................................................................................
+ smlal2 v22.4S, v19.8H, v16.8H // ..............................................*...................................................................
+ smlal v12.4S, v19.4H, v16.4H // ........................................................*.........................................................
+ ldr q19, [x11], #32 // ............................*.....................................................................................
+ ld1 {v16.8H}, [x9], #16 // .............................*....................................................................................
+ uzp1 v4.8H, v19.8H, v3.8H // ..................................*...............................................................................
+ uzp2 v19.8H, v19.8H, v3.8H // .......................................*..........................................................................
+ ldr q3, [x8], #32 // ..............................*...................................................................................
+ ldr q31, [x2], #32 // ......................................*...........................................................................
+ uzp1 v6.8H, v3.8H, v17.8H // ...................................................*..............................................................
+ uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................
+ smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*.......................................................
+ smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*......................................................
+ smlal v13.4S, v9.4H, v17.4H // ............................................................*.....................................................
+ smlal v12.4S, v9.4H, v6.4H // .............................................................*....................................................
+ smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*...................................................
+ smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*..................................................
+ smlal v13.4S, v23.4H, v6.4H // ................................................................*.................................................
+ smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................
+ smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*...............................................
+ smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*..............................................
+ smlal v13.4S, v26.4H, v19.4H // ....................................................................*.............................................
+ smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................
+ smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*...........................................
+ smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*..........................................
+ smlal v13.4S, v10.4H, v4.4H // ........................................................................*.........................................
+ smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................
+ ldr q19, [x2, #-16] // .........................................*........................................................................
+ uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*......................................
+ uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*.............................
+ mul v23.8H, v23.8H, v2.8H // .............................................................................*....................................
+ uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*.................................
+ uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*..............................
+ mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................
+ smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................
+ smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*...............................
+ ldr q23, [x5], #32 // .............................................*....................................................................
+ smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*......
+ uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*...........................
+ smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*.....
+ ldr q17, [x5, #-16] // ................................................*.................................................................
+ ldr q13, [x1, #16] // ......................................................*...........................................................
+ uzp2 v27.8H, v23.8H, v17.8H // ....................................................*.............................................................
+ uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*.....................................
+ uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*..
+ ldr q23, [x1], #32 // ..........................................................................*.......................................
+ zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................*
+ ldr q3, [x7, #16] // ........................................................................................*.........................
+ uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*...................................
+ uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*..................................
+ smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*..........................
+ ldr q6, [x8, #16] // .........................................................................................*........................
+ ldr q23, [x10], #32 // ..........................................................................................*.......................
+ smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*.......
+ ldr q17, [x10, #-16] // ...........................................................................................*......................
+ ld1 {v22.8H}, [x12], #16 // ............................................................................................*.....................
+ uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*....................
+ uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*...................
+ ldr q23, [x4], #32 // ...............................................................................................*..................
+ ldr q17, [x4, #-16] // ................................................................................................*.................
+ ldr q4, [x7], #32 // .................................................................................................*................
+ uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*...............
+ uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*..............
+ uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............
+ smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*...
+ ld1 {v8.8H}, [x6], #16 // ....................................................................................................*.............
+ ldr q25, [x11, #16] // ......................................................................................................*...........
+ ldr q29, [x11], #32 // .......................................................................................................*..........
+ ld1 {v12.8H}, [x9], #16 // ........................................................................................................*.........
+ uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*.
+ ldr q14, [x8], #32 // .........................................................................................................*........
+ ld1 {v23.8H}, [x3], #16 // .............................................................................................................*....
+
+ // ------------------------------------------------- new position -------------------------------------------------->
+ // 0 25 50 75 100
+ // |------------------------|------------------------|------------------------|------------------------|-------------
+ // ldr q3, [x2], #32 // .*................................................................................................................
+ // ldr q17, [x2, #-16] // *.................................................................................................................
+ // ldr q21, [x5], #32 // ..*...............................................................................................................
+ // ldr q19, [x5, #-16] // .....*............................................................................................................
+ // uzp2 v27.8H, v21.8H, v19.8H // .......*..........................................................................................................
+ // ldr q25, [x1, #16] // ......*...........................................................................................................
+ // ldr q22, [x1], #32 // .........*........................................................................................................
+ // uzp1 v28.8H, v21.8H, v19.8H // ........*.........................................................................................................
+ // uzp1 v31.8H, v22.8H, v25.8H // ...........*......................................................................................................
+ // uzp2 v16.8H, v22.8H, v25.8H // ............*.....................................................................................................
+ // uzp2 v21.8H, v3.8H, v17.8H // ...*..............................................................................................................
+ // uzp1 v19.8H, v3.8H, v17.8H // ....*.............................................................................................................
+ // smull2 v24.4S, v31.8H, v21.8H // .............*....................................................................................................
+ // ldr q3, [x7, #16] // ..........*.......................................................................................................
+ // ldr q6, [x8, #16] // .................................*................................................................................
+ // ldr q8, [x10], #32 // ....................................*.............................................................................
+ // ldr q26, [x10, #-16] // .....................................*............................................................................
+ // ld1 {v22.8H}, [x12], #16 // ......................................*...........................................................................
+ // uzp1 v30.8H, v8.8H, v26.8H // .......................................*..........................................................................
+ // uzp2 v11.8H, v8.8H, v26.8H // ........................................*.........................................................................
+ // ldr q8, [x4], #32 // ...................*..............................................................................................
+ // ldr q26, [x4, #-16] // ....................*.............................................................................................
+ // ldr q4, [x7], #32 // ................................*.................................................................................
+ // uzp1 v20.8H, v8.8H, v26.8H // ......................*...........................................................................................
+ // uzp2 v26.8H, v8.8H, v26.8H // .......................*..........................................................................................
+ // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................
+ // uzp1 v9.8H, v4.8H, v3.8H // ..................................*...............................................................................
+ // ldr q25, [x11, #16] // ..........................................*.......................................................................
+ // ldr q29, [x11], #32 // .............................................*....................................................................
+ // ld1 {v12.8H}, [x9], #16 // ..............................................*...................................................................
+ // ldr q14, [x8], #32 // .................................................*................................................................
+ // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................
+ // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................
+ // smlal2 v24.4S, v20.8H, v27.8H // ........................*.........................................................................................
+ // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*..................................................................
+ // smlal2 v24.4S, v26.8H, v28.8H // ............................*.....................................................................................
+ // uzp2 v4.8H, v4.8H, v3.8H // ...................................*..............................................................................
+ // smull2 v13.4S, v31.8H, v19.8H // ...............*..................................................................................................
+ // ldr q3, [x2], #32 // ..................................................*...............................................................
+ // uzp2 v1.8H, v29.8H, v25.8H // ................................................*.................................................................
+ // smlal2 v13.4S, v16.8H, v23.8H // ..........................*.......................................................................................
+ // ldr q17, [x2, #-16] // .....................................................................*............................................
+ // smull v18.4S, v31.4H, v19.4H // ................*.................................................................................................
+ // smlal2 v13.4S, v20.8H, v28.8H // ..............................*...................................................................................
+ // smull v29.4S, v31.4H, v21.4H // ..............*...................................................................................................
+ // ldr q21, [x5], #32 // ..............................................................................*...................................
+ // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*......................................................................
+ // smlal v29.4S, v16.4H, v19.4H // ..................*...............................................................................................
+ // ldr q19, [x5, #-16] // ..................................................................................*...............................
+ // smlal v18.4S, v16.4H, v23.4H // ...........................*......................................................................................
+ // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................
+ // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*..............................................................
+ // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*.............................
+ // smlal v18.4S, v20.4H, v28.4H // ...............................*..................................................................................
+ // ldr q25, [x1, #16] // ...................................................................................*..............................
+ // smlal v29.4S, v26.4H, v28.4H // .............................*....................................................................................
+ // smlal v18.4S, v26.4H, v8.4H // ............................................*.....................................................................
+ // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*.............................................................
+ // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................
+ // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*...........................................................
+ // smlal v29.4S, v9.4H, v26.4H // .......................................................*..........................................................
+ // smlal v18.4S, v9.4H, v31.4H // ........................................................*.........................................................
+ // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................
+ // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*.......................................................
+ // smlal v29.4S, v4.4H, v31.4H // ...........................................................*......................................................
+ // smlal v18.4S, v4.4H, v12.4H // ............................................................*.....................................................
+ // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................................................
+ // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*...................................................
+ // smlal v29.4S, v30.4H, v1.4H // ...............................................................*..................................................
+ // smlal v18.4S, v30.4H, v10.4H // ................................................................*.................................................
+ // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................
+ // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*...............................................
+ // smlal v29.4S, v11.4H, v10.4H // ...................................................................*..............................................
+ // smlal v18.4S, v11.4H, v22.4H // ....................................................................*.............................................
+ // ldr q22, [x1], #32 // .......................................................................................*..........................
+ // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*...........................................
+ // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................
+ // mul v19.8H, v31.8H, v2.8H // ........................................................................*.........................................
+ // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*.......................
+ // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*......................
+ // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................
+ // smlal v29.4S, v19.4H, v0.4H // ............................................................................*.....................................
+ // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*....................................
+ // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*.......................................
+ // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*..........................................
+ // mul v23.8H, v26.8H, v2.8H // ...........................................................................*......................................
+ // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*.................................
+ // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*.....................
+ // ldr q3, [x7, #16] // .........................................................................................*........................
+ // ldr q6, [x8, #16] // .............................................................................................*....................
+ // ldr q8, [x10], #32 // ..............................................................................................*...................
+ // ldr q26, [x10, #-16] // ................................................................................................*.................
+ // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................
+ // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*...............
+ // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*..............
+ // ldr q8, [x4], #32 // ....................................................................................................*.............
+ // ldr q26, [x4, #-16] // .....................................................................................................*............
+ // ldr q4, [x7], #32 // ......................................................................................................*...........
+ // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*..........
+ // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*.........
+ // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*......
+ // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........
+ // ldr q25, [x11, #16] // ............................................................................................................*.....
+ // ldr q29, [x11], #32 // .............................................................................................................*....
+ // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*...
+ // ldr q14, [x8], #32 // ................................................................................................................*.
+ // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*..................
+ // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*..................................
+ // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................
+ // ld1 {v23.8H}, [x3], #16 // .................................................................................................................*
+ // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*.......
+ // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*...........................
+ // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*..
+ // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*.........................
+
+ sub count, count, #2 // count -= 2 before entering the loop. NOTE(review): presumably compensates for iterations handled outside the loop body by the surrounding straight-line code - confirm against the loop's exit test
+polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop:
+ // Instructions: 82
+ // Expected cycles: 102
+ // Expected IPC: 0.80
+ //
+ // Cycle bound: 102.0
+ // IPC bound: 0.80
+ //
+ // Wall time: 15.93s
+ // User time: 15.93s
+ //
+ // ------------------------------- original position ------------------------------->
+ // 0 25 50 75
+ // |------------------------|------------------------|------------------------|------
+ smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................
+ uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................
+ smull2 v13.4S, v31.8H, v19.8H // ..........*.......................................................................
+ ldr q3, [x2], #32 // ....e.............................................................................
+ uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*.......................
+ smlal2 v13.4S, v16.8H, v23.8H // ............*.....................................................................
+ ldr q17, [x2, #-16] // .....e............................................................................
+ smull v18.4S, v31.4H, v19.4H // .........*........................................................................
+ smlal2 v13.4S, v20.8H, v28.8H // ...........................*......................................................
+ smull v29.4S, v31.4H, v21.4H // .............*....................................................................
+ ldr q21, [x5], #32 // .....................e............................................................
+ smlal2 v13.4S, v26.8H, v8.8H // .............................*....................................................
+ smlal v29.4S, v16.4H, v19.4H // ...............*..................................................................
+ ldr q19, [x5, #-16] // ......................e...........................................................
+ smlal v18.4S, v16.4H, v23.4H // ...........*......................................................................
+ smlal v29.4S, v20.4H, v27.4H // ..............................*...................................................
+ uzp1 v31.8H, v14.8H, v6.8H // ........................................*.........................................
+ uzp2 v27.8H, v21.8H, v19.8H // ........................e.........................................................
+ smlal v18.4S, v20.4H, v28.4H // ..........................*.......................................................
+ ldr q25, [x1, #16] // .e................................................................................
+ smlal v29.4S, v26.4H, v28.4H // ................................*.................................................
+ smlal v18.4S, v26.4H, v8.4H // ............................*.....................................................
+ uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................
+ smlal2 v13.4S, v9.8H, v31.8H // ............................................*.....................................
+ smlal2 v24.4S, v9.8H, v26.8H // ................................................*.................................
+ smlal v29.4S, v9.4H, v26.4H // ...............................................*..................................
+ smlal v18.4S, v9.4H, v31.4H // ...........................................*......................................
+ smlal2 v13.4S, v4.8H, v12.8H // ..............................................*...................................
+ smlal2 v24.4S, v4.8H, v31.8H // ..................................................*...............................
+ smlal v29.4S, v4.4H, v31.4H // .................................................*................................
+ smlal v18.4S, v4.4H, v12.4H // .............................................*....................................
+ smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................
+ smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................
+ smlal v29.4S, v30.4H, v1.4H // ................................................................*.................
+ smlal v18.4S, v30.4H, v10.4H // ............................................................*.....................
+ smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*..................
+ smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*..............
+ smlal v29.4S, v11.4H, v10.4H // ..................................................................*...............
+ smlal v18.4S, v11.4H, v22.4H // ..............................................................*...................
+ ldr q22, [x1], #32 // e.................................................................................
+ uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........
+ uzp1 v28.8H, v21.8H, v19.8H // .......................e..........................................................
+ mul v19.8H, v31.8H, v2.8H // ..........................................................................*.......
+ uzp1 v31.8H, v22.8H, v25.8H // ..e...............................................................................
+ uzp2 v16.8H, v22.8H, v25.8H // ...e..............................................................................
+ uzp2 v21.8H, v3.8H, v17.8H // .......e..........................................................................
+ smlal v29.4S, v19.4H, v0.4H // ...........................................................................*......
+ smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*.....
+ uzp1 v19.8H, v3.8H, v17.8H // ......e...........................................................................
+ uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*.............
+ zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l..
+ mul v23.8H, v26.8H, v2.8H // .....................................................................*............
+ uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*....
+ smull2 v24.4S, v31.8H, v21.8H // ..............e...................................................................
+ str q14, [x0, #16] // .................................................................................l
+ ldr q3, [x7, #16] // ...................................e..............................................
+ ldr q6, [x8, #16] // .......................................e..........................................
+ ldr q8, [x10], #32 // ...................................................e..............................
+ ldr q26, [x10, #-16] // ....................................................e.............................
+ ld1 {v22.8H}, [x12], #16 // ...........................................................e......................
+ uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................
+ uzp2 v11.8H, v8.8H, v26.8H // ......................................................e...........................
+ ldr q8, [x4], #32 // .................e................................................................
+ ldr q26, [x4, #-16] // ..................e...............................................................
+ ldr q4, [x7], #32 // ..................................e...............................................
+ uzp1 v20.8H, v8.8H, v26.8H // ...................e..............................................................
+ uzp2 v26.8H, v8.8H, v26.8H // ....................e.............................................................
+ ld1 {v8.8H}, [x6], #16 // .........................e........................................................
+ uzp1 v9.8H, v4.8H, v3.8H // ....................................e.............................................
+ ldr q25, [x11, #16] // ........................................................e.........................
+ ldr q29, [x11], #32 // .......................................................e..........................
+ ld1 {v12.8H}, [x9], #16 // ..........................................e.......................................
+ ldr q14, [x8], #32 // ......................................e...........................................
+ smlal2 v24.4S, v16.8H, v19.8H // ................e.................................................................
+ smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*..........
+ smlal v18.4S, v23.4H, v0.4H // ......................................................................*...........
+ ld1 {v23.8H}, [x3], #16 // ........e.........................................................................
+ smlal2 v24.4S, v20.8H, v27.8H // ...............................e..................................................
+ uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*.........
+ uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................
+ str q5, [x0], #32 // ................................................................................l.
+ zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*...
+
+ // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------>
+ // 0 25 50 75 100 125 150 175 200 225
+ // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------
+ // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~.........................................
+ // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~.............................................................
+ // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~.....................................
+ // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~....................................
+ // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~.............................................................................
+ // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~..........................................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~...................................
+ // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~....
+ // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~.........................................................................
+ // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~..............................................................................
+ // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~..................................................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~...........................................................................
+ // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~.......................................................................
+ // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~...........................
+ // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~....................................................................
+ // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~.......
+ // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~..................
+ // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~.................
+ // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~...............
+ // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~..............
+ // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~......................................................................
+ // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~...................................................................
+ // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~.......................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~...............................................................
+ // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~.............
+ // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~..............................................................
+ // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................
+ // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~...........................................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~.....................................................................
+ // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~.................................................................
+ // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~...
+ // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................
+ // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................
+ // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................
+ // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~.........................
+ // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............
+ // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~...............................................................................
+ // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........
+ // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................
+ // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................
+ // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~..........................................................
+ // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~.........
+ // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~......................................................
+ // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~.........................................................
+ // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~..................................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~.....................................................
+ // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~.......................................................
+ // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................
+ // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~...................................................
+ // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~....................................................
+ // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~.......................
+ // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~......................
+ // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~....................
+ // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~...................
+ // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~..........
+ // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~...........
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~.
+ // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................
+ // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~.....................
+ // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~..............................................
+ // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~.................................................
+ // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~..........................................
+ // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~.............................................
+ // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~...............................................
+ // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................
+ // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~...........................................
+ // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................
+ // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~...............................
+ // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~.............................
+ // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~.....
+ // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~......
+ // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~..
+ // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................
+ // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~......................................
+ // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~..................................
+ // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~.................................
+ // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................
+ // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................
+ // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l..............................
+ // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l
+ // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l..........................
+
+ sub count, count, #1
+ cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop
+
+ // Instructions: 50
+ // Expected cycles: 56
+ // Expected IPC: 0.89
+ //
+ // Cycle bound: 56.0
+ // IPC bound: 0.89
+ //
+ // Wall time: 4.16s
+ // User time: 4.16s
+ //
+ // --------------- original position --------------->
+ // 0 25
+ // |------------------------|
+ smull2 v17.4S, v31.8H, v19.8H // ..*...............................................
+ uzp2 v1.8H, v14.8H, v6.8H // ................*.................................
+ smull v18.4S, v31.4H, v21.4H // .......*..........................................
+ smlal2 v24.4S, v26.8H, v28.8H // *.................................................
+ smlal2 v17.4S, v16.8H, v23.8H // ....*.............................................
+ smull v21.4S, v31.4H, v19.4H // .....*............................................
+ smlal v18.4S, v16.4H, v19.4H // .........*........................................
+ uzp2 v31.8H, v4.8H, v3.8H // .*................................................
+ uzp1 v3.8H, v14.8H, v6.8H // ............*.....................................
+ smlal v21.4S, v16.4H, v23.4H // ..........*.......................................
+ smlal v18.4S, v20.4H, v27.4H // ...........*......................................
+ uzp2 v14.8H, v29.8H, v25.8H // ...*..............................................
+ smlal2 v17.4S, v20.8H, v28.8H // ......*...........................................
+ smlal v21.4S, v20.4H, v28.4H // .............*....................................
+ smlal v18.4S, v26.4H, v28.4H // ..............*...................................
+ smlal2 v24.4S, v9.8H, v1.8H // ..................*...............................
+ smlal2 v17.4S, v26.8H, v8.8H // ........*.........................................
+ smlal v21.4S, v26.4H, v8.4H // ...............*..................................
+ smlal v18.4S, v9.4H, v1.4H // ...................*..............................
+ smlal2 v24.4S, v31.8H, v3.8H // ......................*...........................
+ smlal2 v17.4S, v9.8H, v3.8H // .................*................................
+ smlal v21.4S, v9.4H, v3.4H // ....................*.............................
+ smlal v18.4S, v31.4H, v3.4H // .......................*..........................
+ smlal2 v24.4S, v30.8H, v14.8H // ..........................*.......................
+ smlal2 v17.4S, v31.8H, v12.8H // .....................*............................
+ smlal v21.4S, v31.4H, v12.4H // ........................*.........................
+ smlal v18.4S, v30.4H, v14.4H // ...........................*......................
+ smlal2 v24.4S, v11.8H, v10.8H // ..............................*...................
+ smlal2 v17.4S, v30.8H, v10.8H // .........................*........................
+ smlal v21.4S, v30.4H, v10.4H // ............................*.....................
+ smlal v18.4S, v11.4H, v10.4H // ...............................*..................
+ zip2 v19.8H, v7.8H, v15.8H // ......................................*...........
+ smlal2 v17.4S, v11.8H, v22.8H // .............................*....................
+ smlal v21.4S, v11.4H, v22.4H // ................................*.................
+ uzp1 v23.8H, v18.8H, v24.8H // .................................*................
+ str q19, [x0, #16] // .........................................*........
+ mul v19.8H, v23.8H, v2.8H // ..................................*...............
+ uzp1 v23.8H, v21.8H, v17.8H // .....................................*............
+ str q5, [x0], #32 // .............................................*....
+ mul v26.8H, v23.8H, v2.8H // .......................................*..........
+ smlal v18.4S, v19.4H, v0.4H // ...................................*..............
+ smlal2 v24.4S, v19.8H, v0.8H // ....................................*.............
+ smlal v21.4S, v26.4H, v0.4H // ...........................................*......
+ smlal2 v17.4S, v26.8H, v0.8H // ..........................................*.......
+ uzp2 v13.8H, v18.8H, v24.8H // ........................................*.........
+ uzp2 v19.8H, v21.8H, v17.8H // ............................................*.....
+ zip1 v23.8H, v19.8H, v13.8H // ..............................................*...
+ zip2 v19.8H, v19.8H, v13.8H // ...............................................*..
+ str q23, [x0], #32 // .................................................*
+ str q19, [x0, #-16] // ................................................*.
+
+ // ----------------- new position ------------------>
+ // 0 25
+ // |------------------------|------------------------
+ // smlal2 v24.4S, v26.8H, v28.8H // ...*..............................................
+ // uzp2 v4.8H, v4.8H, v3.8H // .......*..........................................
+ // smull2 v13.4S, v31.8H, v19.8H // *.................................................
+ // uzp2 v1.8H, v29.8H, v25.8H // ...........*......................................
+ // smlal2 v13.4S, v16.8H, v23.8H // ....*.............................................
+ // smull v18.4S, v31.4H, v19.4H // .....*............................................
+ // smlal2 v13.4S, v20.8H, v28.8H // ............*.....................................
+ // smull v29.4S, v31.4H, v21.4H // ..*...............................................
+ // smlal2 v13.4S, v26.8H, v8.8H // ................*.................................
+ // smlal v29.4S, v16.4H, v19.4H // ......*...........................................
+ // smlal v18.4S, v16.4H, v23.4H // .........*........................................
+ // smlal v29.4S, v20.4H, v27.4H // ..........*.......................................
+ // uzp1 v31.8H, v14.8H, v6.8H // ........*.........................................
+ // smlal v18.4S, v20.4H, v28.4H // .............*....................................
+ // smlal v29.4S, v26.4H, v28.4H // ..............*...................................
+ // smlal v18.4S, v26.4H, v8.4H // .................*................................
+ // uzp2 v26.8H, v14.8H, v6.8H // .*................................................
+ // smlal2 v13.4S, v9.8H, v31.8H // ....................*.............................
+ // smlal2 v24.4S, v9.8H, v26.8H // ...............*..................................
+ // smlal v29.4S, v9.4H, v26.4H // ..................*...............................
+ // smlal v18.4S, v9.4H, v31.4H // .....................*............................
+ // smlal2 v13.4S, v4.8H, v12.8H // ........................*.........................
+ // smlal2 v24.4S, v4.8H, v31.8H // ...................*..............................
+ // smlal v29.4S, v4.4H, v31.4H // ......................*...........................
+ // smlal v18.4S, v4.4H, v12.4H // .........................*........................
+ // smlal2 v13.4S, v30.8H, v10.8H // ............................*.....................
+ // smlal2 v24.4S, v30.8H, v1.8H // .......................*..........................
+ // smlal v29.4S, v30.4H, v1.4H // ..........................*.......................
+ // smlal v18.4S, v30.4H, v10.4H // .............................*....................
+ // smlal2 v13.4S, v11.8H, v22.8H // ................................*.................
+ // smlal2 v24.4S, v11.8H, v10.8H // ...........................*......................
+ // smlal v29.4S, v11.4H, v10.4H // ..............................*...................
+ // smlal v18.4S, v11.4H, v22.4H // .................................*................
+ // uzp1 v31.8H, v29.8H, v24.8H // ..................................*...............
+ // mul v19.8H, v31.8H, v2.8H // ....................................*.............
+ // smlal v29.4S, v19.4H, v0.4H // ........................................*.........
+ // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........
+ // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............
+ // zip2 v14.8H, v7.8H, v15.8H // ...............................*..................
+ // mul v23.8H, v26.8H, v2.8H // .......................................*..........
+ // uzp2 v15.8H, v29.8H, v24.8H // ............................................*.....
+ // str q14, [x0, #16] // ...................................*..............
+ // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*......
+ // smlal v18.4S, v23.4H, v0.4H // ..........................................*.......
+ // uzp2 v7.8H, v18.8H, v13.8H // .............................................*....
+ // str q5, [x0], #32 // ......................................*...........
+ // zip1 v5.8H, v7.8H, v15.8H // ..............................................*...
+ // zip2 v14.8H, v7.8H, v15.8H // ...............................................*..
+ // str q14, [x0, #16] // .................................................*
+ // str q5, [x0], #32 // ................................................*.
+
+
+ pop_stack
+ ret
+
+/* REGISTER DEALLOCATIONS: each .unreq below removes one symbolic register alias; the matching .req bindings are presumably earlier in this file (TODO confirm). */
+ .unreq out
+ .unreq a0_ptr
+ .unreq b0_ptr
+ .unreq b0_cache_ptr
+ .unreq a1_ptr
+ .unreq b1_ptr
+ .unreq b1_cache_ptr
+ .unreq a2_ptr
+ .unreq b2_ptr
+ .unreq b2_cache_ptr
+ .unreq a3_ptr
+ .unreq b3_ptr
+ .unreq b3_cache_ptr
+ .unreq count
+ .unreq modulus
+ .unreq modulus_twisted
+ .unreq wtmp
+ .unreq aa0
+ .unreq aa1
+ .unreq bb0
+ .unreq bb1
+ .unreq bb1t
+ .unreq res0l
+ .unreq res1l
+ .unreq res0h
+ .unreq res1h
+ .unreq tmp0
+ .unreq tmp1
+ .unreq q_tmp0
+ .unreq q_tmp1
+ .unreq out0
+ .unreq out1
+ .unreq t0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S
deleted file mode 100644
index 94f0889b7..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-//
-// AArch64 re-implementation of the asymmetric base multiplication from:
-//
-// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
-// https://eprint.iacr.org/2021/986
-// https://github.com/neon-ntt/neon-ntt
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
-
-// Input:
-// - Vectors al, ah of 32-bit entries
-// Output:
-// - Montgomery reductions of al || ah, stored in al
-.macro montgomery_reduce_long x, a
- uzp1 t0.8h, \a\()l.8h, \a\()h.8h
- mul t0.8h, t0.8h, modulus_twisted.8h
- smlal \a\()l.4s, t0.4h, modulus.4h
- smlal2 \a\()h.4s, t0.8h, modulus.8h
- uzp2 \x\().8h, \a\()l.8h, \a\()h.8h
-.endm
-
-// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit.
-//
-// Bounds:
-// - Assume |a| < 4096,
-// - Result: < 2*4096*2^15 = 2^28
-.macro pmull d, a, b
- smull \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smull \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro pmlal d, a, b
- smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro ld2_wrap a, ptr
- ldr q_tmp0, [\ptr\()], #32
- ldr q_tmp1, [\ptr\(), #-16]
- uzp1 \a\()0.8h, tmp0.8h, tmp1.8h
- uzp2 \a\()1.8h, tmp0.8h, tmp1.8h
-.endm
-
-.macro st2_wrap a, ptr
- zip1 tmp0.8h, \a\()0.8h, \a\()1.8h
- zip2 tmp1.8h, \a\()0.8h, \a\()1.8h
- str q_tmp0, [\ptr\()], #32
- str q_tmp1, [\ptr\(), #-16]
-.endm
-
-.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
- ld2_wrap \a\(), \a_ptr
- ld2_wrap \b\(), \b_ptr
- ld1 {\b\()1t.8h}, [\b_cache_ptr], #16
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
- out .req x0
- a0_ptr .req x1
- b0_ptr .req x2
- b0_cache_ptr .req x3
- a1_ptr .req x4
- b1_ptr .req x5
- b1_cache_ptr .req x6
- a2_ptr .req x7
- b2_ptr .req x8
- b2_cache_ptr .req x9
- a3_ptr .req x10
- b3_ptr .req x11
- b3_cache_ptr .req x12
- count .req x13
- wtmp .req w14
-
- modulus .req v0
- modulus_twisted .req v2
-
- aa0 .req v3
- aa1 .req v4
- bb0 .req v5
- bb1 .req v6
- bb1t .req v7
-
- res0l .req v8
- res1l .req v9
- res0h .req v10
- res1h .req v11
-
- tmp0 .req v12
- tmp1 .req v13
- q_tmp0 .req q12
- q_tmp1 .req q13
-
- out0 .req v26
- out1 .req v27
-
- t0 .req v28
-
-#if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
- push_stack
-
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
-
- mov count, #(MLKEM_N / 16)
-k2_loop_start:
-
- load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr
- pmull res, aa, bb
- load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr
- pmlal res, aa, bb
-
- montgomery_reduce_long out0, res0
- montgomery_reduce_long out1, res1
-
- st2_wrap out, out
-
- subs count, count, #1
- cbnz count, k2_loop_start
-
- pop_stack
- ret
-#endif /* MLKEM_K == 2 */
-
-#if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
-
- mov count, #(MLKEM_N / 16)
-k3_loop_start:
-
- load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr
- pmull res, aa, bb
- load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr
- pmlal res, aa, bb
- load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr
- pmlal res, aa, bb
-
- montgomery_reduce_long out0, res0
- montgomery_reduce_long out1, res1
-
- st2_wrap out, out
-
- subs count, count, #1
- cbnz count, k3_loop_start
-
- pop_stack
- ret
-#endif /* MLKEM_K == 3 */
-
-#if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
- add a3_ptr, a0_ptr, #(3 * 512)
- add b3_ptr, b0_ptr, #(3 * 512)
- add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2)
-
- // Bounds:
- //
- // Each pmull is bound by 2*4096*2^15=2^28, so the final value
- // before Montgomery reduction is bound by 2^30.
-
- mov count, #(MLKEM_N / 16)
-k4_loop_start:
-
- load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr
- pmull res, aa, bb
- load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr
- pmlal res, aa, bb
- load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr
- pmlal res, aa, bb
- load_polys aa, bb, a3_ptr, b3_ptr, b3_cache_ptr
- pmlal res, aa, bb
-
- montgomery_reduce_long out0, res0
- montgomery_reduce_long out1, res1
-
- st2_wrap out, out
-
- subs count, count, #1
- cbnz count, k4_loop_start
-
- pop_stack
- ret
-#endif /* MLKEM_K == 4 */
-
-/****************** REGISTER DEALLOCATIONS *******************/
- .unreq out
- .unreq a0_ptr
- .unreq b0_ptr
- .unreq b0_cache_ptr
- .unreq a1_ptr
- .unreq b1_ptr
- .unreq b1_cache_ptr
- .unreq a2_ptr
- .unreq b2_ptr
- .unreq b2_cache_ptr
- .unreq a3_ptr
- .unreq b3_ptr
- .unreq b3_cache_ptr
- .unreq count
- .unreq modulus
- .unreq modulus_twisted
- .unreq aa0
- .unreq aa1
- .unreq bb0
- .unreq bb1
- .unreq bb1t
- .unreq res0l
- .unreq res1l
- .unreq res0h
- .unreq wtmp
- .unreq res1h
- .unreq tmp0
- .unreq tmp1
- .unreq q_tmp0
- .unreq q_tmp1
- .unreq out0
- .unreq out1
- .unreq t0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S
deleted file mode 100644
index 275ca06d2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S
+++ /dev/null
@@ -1,1606 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-// AArch64 re-implementation of the asymmetric base multiplication from:
-
-// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
-// https://eprint.iacr.org/2021/986
-// https://github.com/neon-ntt/neon-ntt
-
-#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
-
-// Input:
-// - Vectors al, ah of 32-bit entries
-// Output:
-// - Montgomery reductions of al || ah, stored in al
-.macro montgomery_reduce_long x, a
- uzp1 t0.8h, \a\()l.8h, \a\()h.8h
- mul t0.8h, t0.8h, modulus_twisted.8h
- smlal \a\()l.4s, t0.4h, modulus.4h
- smlal2 \a\()h.4s, t0.8h, modulus.8h
- uzp2 \x\().8h, \a\()l.8h, \a\()h.8h
-.endm
-
-// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit.
-
-// Bounds:
-// - Assume |a| < 4096,
-// - Result: < 2*4096*2^15 = 2^28
-.macro pmull d, a, b
- smull \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smull \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro pmlal d, a, b
- smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h
- smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
- smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h
- smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
-
- smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h
- smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
- smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h
- smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
-.endm
-
-.macro ld2_wrap a, ptr
- ldr q_tmp0, [\ptr\()], #32
- ldr q_tmp1, [\ptr\(), #-16]
- uzp1 \a\()0.8h, tmp0.8h, tmp1.8h
- uzp2 \a\()1.8h, tmp0.8h, tmp1.8h
-.endm
-
-.macro st2_wrap a, ptr
- zip1 tmp0.8h, \a\()0.8h, \a\()1.8h
- zip2 tmp1.8h, \a\()0.8h, \a\()1.8h
- str q_tmp0, [\ptr\()], #32
- str q_tmp1, [\ptr\(), #-16]
-.endm
-
-.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
- ld2_wrap \a\(), \a_ptr
- ld2_wrap \b\(), \b_ptr
- ld1 {\b\()1t.8h}, [\b_cache_ptr], #16
-.endm
-
-.macro save_vregs
- sub sp, sp, #(16*4)
- stp d8, d9, [sp, #16*0]
- stp d10, d11, [sp, #16*1]
- stp d12, d13, [sp, #16*2]
- stp d14, d15, [sp, #16*3]
-.endm
-
-.macro restore_vregs
- ldp d8, d9, [sp, #16*0]
- ldp d10, d11, [sp, #16*1]
- ldp d12, d13, [sp, #16*2]
- ldp d14, d15, [sp, #16*3]
- add sp, sp, #(16*4)
-.endm
-
-.macro push_stack
- save_vregs
-.endm
-
-.macro pop_stack
- restore_vregs
-.endm
-
- out .req x0
- a0_ptr .req x1
- b0_ptr .req x2
- b0_cache_ptr .req x3
- a1_ptr .req x4
- b1_ptr .req x5
- b1_cache_ptr .req x6
- a2_ptr .req x7
- b2_ptr .req x8
- b2_cache_ptr .req x9
- a3_ptr .req x10
- b3_ptr .req x11
- b3_cache_ptr .req x12
- count .req x13
- wtmp .req w14
-
- modulus .req v0
- modulus_twisted .req v2
-
- aa0 .req v3
- aa1 .req v4
- bb0 .req v5
- bb1 .req v6
- bb1t .req v7
-
- res0l .req v8
- res1l .req v9
- res0h .req v10
- res1h .req v11
-
- tmp0 .req v12
- tmp1 .req v13
- q_tmp0 .req q12
- q_tmp1 .req q13
-
- out0 .req v26
- out1 .req v27
-
- t0 .req v28
-
-#if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
- push_stack
-
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
-
- mov count, #(MLKEM_N / 16)
- // Instructions: 75
- // Expected cycles: 94
- // Expected IPC: 0.80
-
- // Cycle bound: 94.0
- // IPC bound: 0.80
-
- // Wall time: 1.49s
- // User time: 1.49s
-
- // --------------------------- original position ---------------------------->
- // 0 25 50
- // |------------------------|------------------------|
- ldr q9, [x4], #32 // *..........................................................................
- ldr q5, [x4, #-16] // ......*....................................................................
- ldr q11, [x5], #32 // .*.........................................................................
- uzp1 v23.8H, v9.8H, v5.8H // .........*.................................................................
- uzp2 v9.8H, v9.8H, v5.8H // .....................*.....................................................
- ldr q5, [x2], #32 // ..*........................................................................
- ldr q7, [x5, #-16] // ..............*............................................................
- ldr q21, [x2, #-16] // ...*.......................................................................
- uzp2 v10.8H, v11.8H, v7.8H // .................*.........................................................
- uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................
- uzp1 v7.8H, v5.8H, v21.8H // ....*......................................................................
- uzp2 v5.8H, v5.8H, v21.8H // .....*.....................................................................
- ldr q21, [x1], #32 // .......*...................................................................
- ldr q25, [x1, #-16] // ........*..................................................................
- ld1 {v6.8H}, [x3], #16 // ............................*..............................................
- uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................
- uzp2 v21.8H, v21.8H, v25.8H // ...........*...............................................................
- smull v25.4S, v26.4H, v5.4H // ............*..............................................................
- smull2 v5.4S, v26.8H, v5.8H // .............*.............................................................
- smull v19.4S, v26.4H, v7.4H // ..........................*................................................
- smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................
- smlal v25.4S, v21.4H, v7.4H // ...............*...........................................................
- smlal2 v5.4S, v21.8H, v7.8H // ................*..........................................................
- smlal v19.4S, v21.4H, v6.4H // ...................................*.......................................
- smlal2 v26.4S, v21.8H, v6.8H // .................................*.........................................
- smlal v25.4S, v23.4H, v10.4H // ...................*.......................................................
- smlal2 v5.4S, v23.8H, v10.8H // ....................*......................................................
- smlal v19.4S, v23.4H, v11.4H // ......................................*....................................
- smlal2 v26.4S, v23.8H, v11.8H // ....................................*......................................
- ld1 {v23.8H}, [x6], #16 // ........................*..................................................
- smlal v25.4S, v9.4H, v11.4H // ......................*....................................................
- smlal2 v5.4S, v9.8H, v11.8H // .......................*...................................................
- smlal2 v26.4S, v9.8H, v23.8H // .......................................*...................................
- smlal v19.4S, v9.4H, v23.4H // .........................................*.................................
- ldr q9, [x4], #32 // ...............................*...........................................
- uzp1 v11.8H, v25.8H, v5.8H // .........................*.................................................
- uzp1 v23.8H, v19.8H, v26.8H // .............................................*.............................
- mul v11.8H, v11.8H, v2.8H // ...........................*...............................................
- mul v23.8H, v23.8H, v2.8H // ..............................................*............................
- ldr q7, [x5], #32 // ................................*..........................................
- smlal2 v5.4S, v11.8H, v0.8H // .............................*.............................................
- smlal v25.4S, v11.4H, v0.4H // ..................................*........................................
- ldr q11, [x2], #32 // .....................................*.....................................
- ldr q21, [x2, #-16] // ........................................*..................................
- ldr q6, [x4, #-16] // ...............................................*...........................
- uzp1 v17.8H, v11.8H, v21.8H // ...........................................*...............................
- ldr q10, [x1], #32 // ................................................*..........................
- ldr q29, [x1, #-16] // .................................................*.........................
- uzp2 v11.8H, v11.8H, v21.8H // ............................................*..............................
- uzp1 v13.8H, v9.8H, v6.8H // ...................................................*.......................
- uzp1 v3.8H, v10.8H, v29.8H // ....................................................*......................
- uzp2 v10.8H, v10.8H, v29.8H // .....................................................*.....................
- smull v12.4S, v3.4H, v11.4H // ......................................................*....................
- smull2 v11.4S, v3.8H, v11.8H // .......................................................*...................
- ldr q21, [x5, #-16] // ........................................................*..................
- smlal v12.4S, v10.4H, v17.4H // .........................................................*.................
- smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................
- uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*...............
- uzp1 v15.8H, v7.8H, v21.8H // ............................................................*..............
- smlal v12.4S, v13.4H, v29.4H // .............................................................*.............
- smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............
- uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*...........
- smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................
- smlal v12.4S, v28.4H, v15.4H // .................................................................*.........
- smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........
- smlal v19.4S, v23.4H, v0.4H // ................................................................*..........
- uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................
- smull v23.4S, v3.4H, v17.4H // ......................................................................*....
- uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*.....
- uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*......
- mul v14.8H, v9.8H, v2.8H // .......................................................................*...
- ld1 {v22.8H}, [x6], #16 // ...................................................................*.......
- zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
- smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................*
- ld1 {v4.8H}, [x3], #16 // .........................................................................*.
-
- // ------------------------------ new position ------------------------------>
- // 0 25 50
- // |------------------------|------------------------|------------------------
- // ldr q18, [x4], #32 // *..........................................................................
- // ldr q30, [x5], #32 // ..*........................................................................
- // ldr q8, [x2], #32 // .....*.....................................................................
- // ldr q9, [x2, #-16] // .......*...................................................................
- // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................
- // uzp2 v4.8H, v8.8H, v9.8H // ...........*...............................................................
- // ldr q19, [x4, #-16] // .*.........................................................................
- // ldr q29, [x1], #32 // ............*..............................................................
- // ldr q12, [x1, #-16] // .............*.............................................................
- // uzp1 v13.8H, v18.8H, v19.8H // ...*.......................................................................
- // uzp1 v3.8H, v29.8H, v12.8H // ...............*...........................................................
- // uzp2 v10.8H, v29.8H, v12.8H // ................*..........................................................
- // smull v12.4S, v3.4H, v4.4H // .................*.........................................................
- // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................
- // ldr q5, [x5, #-16] // ......*....................................................................
- // smlal v12.4S, v10.4H, v17.4H // .....................*.....................................................
- // smlal2 v11.4S, v10.8H, v17.8H // ......................*....................................................
- // uzp2 v14.8H, v30.8H, v5.8H // ........*..................................................................
- // uzp1 v15.8H, v30.8H, v5.8H // .........*.................................................................
- // smlal v12.4S, v13.4H, v14.4H // .........................*.................................................
- // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................
- // uzp2 v28.8H, v18.8H, v19.8H // ....*......................................................................
- // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................
- // smlal2 v11.4S, v28.8H, v15.8H // ...............................*...........................................
- // ld1 {v22.8H}, [x6], #16 // .............................*.............................................
- // uzp1 v1.8H, v12.8H, v11.8H // ...................................*.......................................
- // smull v23.4S, v3.4H, v17.4H // ...................*.......................................................
- // mul v14.8H, v1.8H, v2.8H // .....................................*.....................................
- // ld1 {v4.8H}, [x3], #16 // ..............*............................................................
- // smlal2 v11.4S, v14.8H, v0.8H // ........................................*..................................
- // smull2 v20.4S, v3.8H, v17.8H // ....................*......................................................
- // ldr q18, [x4], #32 // ..................................*........................................
- // ldr q30, [x5], #32 // .......................................*...................................
- // smlal2 v20.4S, v10.8H, v4.8H // ........................*..................................................
- // smlal v12.4S, v14.4H, v0.4H // .........................................*.................................
- // smlal v23.4S, v10.4H, v4.4H // .......................*...................................................
- // smlal2 v20.4S, v13.8H, v15.8H // ............................*..............................................
- // ldr q8, [x2], #32 // ..........................................*................................
- // smlal v23.4S, v13.4H, v15.4H // ...........................*...............................................
- // smlal2 v20.4S, v28.8H, v22.8H // ................................*..........................................
- // ldr q9, [x2, #-16] // ...........................................*...............................
- // smlal v23.4S, v28.4H, v22.4H // .................................*.........................................
- // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........
- // uzp1 v17.8H, v8.8H, v9.8H // .............................................*.............................
- // uzp2 v4.8H, v8.8H, v9.8H // ................................................*..........................
- // uzp1 v5.8H, v23.8H, v20.8H // ....................................*......................................
- // mul v31.8H, v5.8H, v2.8H // ......................................*....................................
- // ldr q19, [x4, #-16] // ............................................*..............................
- // ldr q29, [x1], #32 // ..............................................*............................
- // ldr q12, [x1, #-16] // ...............................................*...........................
- // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............
- // uzp1 v13.8H, v18.8H, v19.8H // .................................................*.........................
- // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................
- // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*.......................
- // smull v12.4S, v3.4H, v4.4H // ....................................................*......................
- // smull2 v11.4S, v3.8H, v4.8H // .....................................................*.....................
- // ldr q5, [x5, #-16] // ......................................................*....................
- // smlal v12.4S, v10.4H, v17.4H // .......................................................*...................
- // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*..................
- // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*.................
- // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................
- // smlal v12.4S, v13.4H, v14.4H // ...........................................................*...............
- // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*..............
- // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*.............
- // smlal v23.4S, v31.4H, v0.4H // .................................................................*.........
- // smlal v12.4S, v28.4H, v15.4H // ...............................................................*...........
- // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*..........
- // ld1 {v22.8H}, [x6], #16 // .......................................................................*...
- // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*.....
- // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*......
- // smull v23.4S, v3.4H, v17.4H // ...................................................................*.......
- // mul v14.8H, v1.8H, v2.8H // ......................................................................*....
- // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*..
- // ld1 {v4.8H}, [x3], #16 // ..........................................................................*
- // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*.
-
- sub count, count, #2
-1:
- // Instructions: 48
- // Expected cycles: 58
- // Expected IPC: 0.83
-
- // Cycle bound: 58.0
- // IPC bound: 0.83
-
- // Wall time: 6.39s
- // User time: 6.39s
-
- // -------------- original position -------------->
- // 0 25
- // |------------------------|----------------------
- smull2 v20.4S, v3.8H, v17.8H // ..........*.....................................
- ldr q18, [x4], #32 // .................e..............................
- ldr q30, [x5], #32 // .....................e..........................
- smlal2 v20.4S, v10.8H, v4.8H // ............*...................................
- smlal v12.4S, v14.4H, v0.4H // .........................................*......
- smlal v23.4S, v10.4H, v4.4H // ...........*....................................
- str q9, [x0, #16] // ...............................................l
- smlal2 v20.4S, v13.8H, v15.8H // ...........................*....................
- ldr q8, [x2], #32 // ....e...........................................
- smlal v23.4S, v13.4H, v15.4H // ..........................*.....................
- smlal2 v20.4S, v28.8H, v22.8H // .............................*..................
- zip1 v26.8H, v19.8H, v27.8H // ............................................l...
- ldr q9, [x2, #-16] // .....e..........................................
- smlal v23.4S, v28.4H, v22.4H // ............................*...................
- uzp2 v27.8H, v12.8H, v11.8H // ...........................................*....
- uzp1 v17.8H, v8.8H, v9.8H // ......e.........................................
- uzp2 v4.8H, v8.8H, v9.8H // .......e........................................
- uzp1 v5.8H, v23.8H, v20.8H // ..................................*.............
- str q26, [x0], #32 // ..............................................l.
- mul v31.8H, v5.8H, v2.8H // ...................................*............
- ldr q19, [x4, #-16] // ..................e.............................
- ldr q29, [x1], #32 // e...............................................
- ldr q12, [x1, #-16] // .e..............................................
- smlal2 v20.4S, v31.8H, v0.8H // .....................................*..........
- uzp1 v13.8H, v18.8H, v19.8H // ...................e............................
- uzp1 v3.8H, v29.8H, v12.8H // ..e.............................................
- uzp2 v10.8H, v29.8H, v12.8H // ...e............................................
- smull v12.4S, v3.4H, v4.4H // .............e..................................
- smull2 v11.4S, v3.8H, v4.8H // ..............e.................................
- ldr q5, [x5, #-16] // ......................e.........................
- smlal v12.4S, v10.4H, v17.4H // ...............e................................
- smlal2 v11.4S, v10.8H, v17.8H // ................e...............................
- uzp2 v14.8H, v30.8H, v5.8H // ........................e.......................
- uzp1 v15.8H, v30.8H, v5.8H // .......................e........................
- smlal v12.4S, v13.4H, v14.4H // ..............................e.................
- smlal2 v11.4S, v13.8H, v14.8H // ...............................e................
- uzp2 v28.8H, v18.8H, v19.8H // ....................e...........................
- smlal v23.4S, v31.4H, v0.4H // ....................................*...........
- smlal v12.4S, v28.4H, v15.4H // ................................e...............
- smlal2 v11.4S, v28.8H, v15.8H // .................................e..............
- ld1 {v22.8H}, [x6], #16 // .........................e......................
- uzp2 v19.8H, v23.8H, v20.8H // ......................................*.........
- uzp1 v1.8H, v12.8H, v11.8H // .......................................e........
- smull v23.4S, v3.4H, v17.4H // .........e......................................
- mul v14.8H, v1.8H, v2.8H // ........................................e.......
- zip2 v9.8H, v19.8H, v27.8H // .............................................*..
- ld1 {v4.8H}, [x3], #16 // ........e.......................................
- smlal2 v11.4S, v14.8H, v0.8H // ..........................................e.....
-
- // ------------------------------------------------- new position -------------------------------------------------->
- // 0 25 50 75 100
- // |------------------------|------------------------|------------------------|------------------------|-------------
- // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'..................
- // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'..................
- // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'..................
- // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'..................
- // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~..........
- // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~......
- // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~...
- // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~..
- // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'..................
- // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'..................
- // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~..................
- // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~.............
- // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~...............
- // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'..................
- // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'..................
- // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'..................
- // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'..................
- // ldr q12, [x4], #32 // e..............................................'~..............................................'~.................
- // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'..................
- // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'..................
- // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'..................
- // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................
- // ldr q13, [x5, #-16] // ............................e..................'............................~..................'..................
- // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'..................
- // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'..................
- // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'..................
- // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~.........
- // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~...........
- // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~.....
- // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........
- // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'..................
- // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'..................
- // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'..................
- // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'..................
- // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~.
- // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'..................
- // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'..................
- // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'..................
- // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'..................
- // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'..................
- // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'..................
- // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~..............
- // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'..................
- // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~....
- // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l.......
- // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'..................
- // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l
- // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............
-
- sub count, count, #1
- cbnz count, 1b
- // Instructions: 21
- // Expected cycles: 35
- // Expected IPC: 0.60
-
- // Cycle bound: 35.0
- // IPC bound: 0.60
-
- // Wall time: 0.08s
- // User time: 0.08s
-
- // ----- original position ----->
- // 0 25
- // |------------------------|----
- smull2 v5.4S, v3.8H, v17.8H // *.............................
- smlal v12.4S, v14.4H, v0.4H // ..*...........................
- smlal v23.4S, v10.4H, v4.4H // ...*..........................
- str q9, [x0, #16] // ....*.........................
- smlal2 v5.4S, v10.8H, v4.8H // .*............................
- uzp2 v11.8H, v12.8H, v11.8H // ..........*...................
- zip1 v9.8H, v19.8H, v27.8H // ........*.....................
- smlal v23.4S, v13.4H, v15.4H // ......*.......................
- smlal2 v5.4S, v13.8H, v15.8H // .....*........................
- str q9, [x0], #32 // ............*.................
- smlal v23.4S, v28.4H, v22.4H // .........*....................
- smlal2 v5.4S, v28.8H, v22.8H // .......*......................
- uzp1 v9.8H, v23.8H, v5.8H // ...........*..................
- mul v9.8H, v9.8H, v2.8H // .............*................
- smlal2 v5.4S, v9.8H, v0.8H // ..............*...............
- smlal v23.4S, v9.4H, v0.4H // ...............*..............
- uzp2 v9.8H, v23.8H, v5.8H // ................*.............
- zip2 v5.8H, v9.8H, v11.8H // .................*............
- zip1 v9.8H, v9.8H, v11.8H // ...................*..........
- str q5, [x0, #16] // ..................*...........
- str q9, [x0], #32 // ....................*.........
-
- // -------- new position -------->
- // 0 25
- // |------------------------|-----
- // smull2 v20.4S, v3.8H, v17.8H // *..............................
- // smlal2 v20.4S, v10.8H, v4.8H // ....*..........................
- // smlal v12.4S, v14.4H, v0.4H // .*.............................
- // smlal v23.4S, v10.4H, v4.4H // ..*............................
- // str q9, [x0, #16] // ...*...........................
- // smlal2 v20.4S, v13.8H, v15.8H // ........*......................
- // smlal v23.4S, v13.4H, v15.4H // .......*.......................
- // smlal2 v20.4S, v28.8H, v22.8H // ...........*...................
- // zip1 v26.8H, v19.8H, v27.8H // ......*........................
- // smlal v23.4S, v28.4H, v22.4H // ..........*....................
- // uzp2 v27.8H, v12.8H, v11.8H // .....*.........................
- // uzp1 v5.8H, v23.8H, v20.8H // ............*..................
- // str q26, [x0], #32 // .........*.....................
- // mul v31.8H, v5.8H, v2.8H // .............*.................
- // smlal2 v20.4S, v31.8H, v0.8H // ..............*................
- // smlal v23.4S, v31.4H, v0.4H // ...............*...............
- // uzp2 v19.8H, v23.8H, v20.8H // ................*..............
- // zip2 v9.8H, v19.8H, v27.8H // .................*.............
- // str q9, [x0, #16] // ...................*...........
- // zip1 v26.8H, v19.8H, v27.8H // ..................*............
- // str q26, [x0], #32 // ....................*..........
-
-
- pop_stack
- ret
-#endif /* MLKEM_K == 2 */
-
-#if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
-
- mov count, #(MLKEM_N / 16)
- // Instructions: 75
- // Expected cycles: 103
- // Expected IPC: 0.73
-
- // Cycle bound: 103.0
- // IPC bound: 0.73
-
- // Wall time: 0.94s
- // User time: 0.94s
-
- // --------------------------- original position ---------------------------->
- // 0 25 50
- // |------------------------|------------------------|
- ldr q7, [x2, #16] // *..........................................................................
- ldr q20, [x2], #32 // ..*........................................................................
- ldr q15, [x1, #16] // .*.........................................................................
- uzp1 v8.8H, v20.8H, v7.8H // ...............*...........................................................
- uzp2 v7.8H, v20.8H, v7.8H // ................*..........................................................
- ld1 {v20.8H}, [x3], #16 // ...*.......................................................................
- ldr q30, [x1], #32 // ..............*............................................................
- ldr q11, [x4], #32 // ....*......................................................................
- uzp1 v16.8H, v30.8H, v15.8H // .................*.........................................................
- uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................
- smull v30.4S, v16.4H, v7.4H // ...................*.......................................................
- smull2 v7.4S, v16.8H, v7.8H // ....................*......................................................
- smull v9.4S, v16.4H, v8.4H // .....................*.....................................................
- smull2 v16.4S, v16.8H, v8.8H // ......................*....................................................
- smlal v30.4S, v15.4H, v8.4H // .......................*...................................................
- smlal2 v7.4S, v15.8H, v8.8H // ........................*..................................................
- smlal v9.4S, v15.4H, v20.4H // .........................*.................................................
- smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................
- ldr q20, [x4, #-16] // .....*.....................................................................
- ldr q15, [x5], #32 // ......*....................................................................
- uzp1 v8.8H, v11.8H, v20.8H // ...........................*...............................................
- uzp2 v20.8H, v11.8H, v20.8H // ............................*..............................................
- ldr q11, [x5, #-16] // .......*...................................................................
- ld1 {v27.8H}, [x6], #16 // ........*..................................................................
- uzp1 v10.8H, v15.8H, v11.8H // .............................*.............................................
- uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................
- smlal v9.4S, v8.4H, v10.4H // ...............................*...........................................
- smlal2 v16.4S, v8.8H, v10.8H // ................................*..........................................
- smlal v30.4S, v8.4H, v15.4H // .................................*.........................................
- smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................
- smlal v9.4S, v20.4H, v27.4H // ...................................*.......................................
- smlal2 v16.4S, v20.8H, v27.8H // ....................................*......................................
- smlal v30.4S, v20.4H, v10.4H // .....................................*.....................................
- smlal2 v7.4S, v20.8H, v10.8H // ......................................*....................................
- ldr q20, [x7], #32 // .........*.................................................................
- ldr q15, [x7, #-16] // ..........*................................................................
- ldr q8, [x8], #32 // ...........*...............................................................
- uzp1 v11.8H, v20.8H, v15.8H // .......................................*...................................
- uzp2 v20.8H, v20.8H, v15.8H // ........................................*..................................
- ldr q15, [x8, #-16] // ............*..............................................................
- ld1 {v27.8H}, [x9], #16 // .............*.............................................................
- uzp1 v10.8H, v8.8H, v15.8H // .........................................*.................................
- uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................
- smlal v9.4S, v11.4H, v10.4H // ...........................................*...............................
- smlal2 v16.4S, v11.8H, v10.8H // ............................................*..............................
- smlal v30.4S, v11.4H, v15.4H // .............................................*.............................
- smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................
- smlal v9.4S, v20.4H, v27.4H // ...............................................*...........................
- smlal2 v16.4S, v20.8H, v27.8H // ................................................*..........................
- smlal v30.4S, v20.4H, v10.4H // .................................................*.........................
- smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................
- ldr q15, [x2], #32 // ...............................................................*...........
- uzp1 v20.8H, v9.8H, v16.8H // ....................................................*......................
- uzp1 v8.8H, v30.8H, v7.8H // .....................................................*.....................
- mul v20.8H, v20.8H, v2.8H // ......................................................*....................
- mul v8.8H, v8.8H, v2.8H // .......................................................*...................
- ldr q21, [x4], #32 // .................................................................*.........
- smlal v9.4S, v20.4H, v0.4H // ........................................................*..................
- smlal2 v16.4S, v20.8H, v0.8H // .........................................................*.................
- smlal v30.4S, v8.4H, v0.4H // ..........................................................*................
- smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*...............
- ldr q6, [x4, #-16] // ..................................................................*........
- uzp2 v27.8H, v9.8H, v16.8H // ............................................................*..............
- uzp2 v10.8H, v30.8H, v7.8H // .............................................................*.............
- ldr q16, [x2, #-16] // ...................................................*.......................
- ldr q30, [x1, #16] // ..............................................................*............
- ld1 {v9.8H}, [x3], #16 // ................................................................*..........
- ldr q1, [x5], #32 // ...................................................................*.......
- ldr q12, [x5, #-16] // ....................................................................*......
- ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
- ldr q19, [x7], #32 // ......................................................................*....
- ldr q31, [x7, #-16] // .......................................................................*...
- ldr q17, [x8], #32 // ........................................................................*..
- ldr q18, [x8, #-16] // .........................................................................*.
- ld1 {v25.8H}, [x9], #16 // ..........................................................................*
-
- // ------------------------------ new position ------------------------------>
- // 0 25 50
- // |------------------------|------------------------|------------------------
- // ldr q16, [x2, #16] // *..........................................................................
- // ldr q30, [x1, #16] // ..*........................................................................
- // ldr q15, [x2], #32 // .*.........................................................................
- // ld1 {v9.8H}, [x3], #16 // .....*.....................................................................
- // ldr q21, [x4], #32 // .......*...................................................................
- // ldr q6, [x4, #-16] // ..................*........................................................
- // ldr q1, [x5], #32 // ...................*.......................................................
- // ldr q12, [x5, #-16] // ......................*....................................................
- // ld1 {v24.8H}, [x6], #16 // .......................*...................................................
- // ldr q19, [x7], #32 // ..................................*........................................
- // ldr q31, [x7, #-16] // ...................................*.......................................
- // ldr q17, [x8], #32 // ....................................*......................................
- // ldr q18, [x8, #-16] // .......................................*...................................
- // ld1 {v25.8H}, [x9], #16 // ........................................*..................................
- // ldr q20, [x1], #32 // ......*....................................................................
- // uzp1 v7.8H, v15.8H, v16.8H // ...*.......................................................................
- // uzp2 v15.8H, v15.8H, v16.8H // ....*......................................................................
- // uzp1 v8.8H, v20.8H, v30.8H // ........*..................................................................
- // uzp2 v20.8H, v20.8H, v30.8H // .........*.................................................................
- // smull v30.4S, v8.4H, v15.4H // ..........*................................................................
- // smull2 v15.4S, v8.8H, v15.8H // ...........*...............................................................
- // smull v11.4S, v8.4H, v7.4H // ............*..............................................................
- // smull2 v8.4S, v8.8H, v7.8H // .............*.............................................................
- // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................
- // smlal2 v15.4S, v20.8H, v7.8H // ...............*...........................................................
- // smlal v11.4S, v20.4H, v9.4H // ................*..........................................................
- // smlal2 v8.4S, v20.8H, v9.8H // .................*.........................................................
- // uzp1 v7.8H, v21.8H, v6.8H // ....................*......................................................
- // uzp2 v20.8H, v21.8H, v6.8H // .....................*.....................................................
- // uzp1 v16.8H, v1.8H, v12.8H // ........................*..................................................
- // uzp2 v9.8H, v1.8H, v12.8H // .........................*.................................................
- // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................
- // smlal2 v8.4S, v7.8H, v16.8H // ...........................*...............................................
- // smlal v30.4S, v7.4H, v9.4H // ............................*..............................................
- // smlal2 v15.4S, v7.8H, v9.8H // .............................*.............................................
- // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................
- // smlal2 v8.4S, v20.8H, v24.8H // ...............................*...........................................
- // smlal v30.4S, v20.4H, v16.4H // ................................*..........................................
- // smlal2 v15.4S, v20.8H, v16.8H // .................................*.........................................
- // uzp1 v7.8H, v19.8H, v31.8H // .....................................*.....................................
- // uzp2 v20.8H, v19.8H, v31.8H // ......................................*....................................
- // uzp1 v16.8H, v17.8H, v18.8H // .........................................*.................................
- // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................
- // smlal v11.4S, v7.4H, v16.4H // ...........................................*...............................
- // smlal2 v8.4S, v7.8H, v16.8H // ............................................*..............................
- // smlal v30.4S, v7.4H, v9.4H // .............................................*.............................
- // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................
- // smlal v11.4S, v20.4H, v25.4H // ...............................................*...........................
- // smlal2 v8.4S, v20.8H, v25.8H // ................................................*..........................
- // smlal v30.4S, v20.4H, v16.4H // .................................................*.........................
- // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................
- // ldr q16, [x2, #16] // ................................................................*..........
- // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*......................
- // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*.....................
- // mul v7.8H, v7.8H, v2.8H // ......................................................*....................
- // mul v20.8H, v20.8H, v2.8H // .......................................................*...................
- // smlal v11.4S, v7.4H, v0.4H // .........................................................*.................
- // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................
- // smlal v30.4S, v20.4H, v0.4H // ...........................................................*...............
- // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*..............
- // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............
- // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*...........
- // ldr q30, [x1, #16] // .................................................................*.........
- // ldr q15, [x2], #32 // ...................................................*.......................
- // ld1 {v9.8H}, [x3], #16 // ..................................................................*........
- // ldr q21, [x4], #32 // ........................................................*..................
- // ldr q6, [x4, #-16] // .............................................................*.............
- // ldr q1, [x5], #32 // ...................................................................*.......
- // ldr q12, [x5, #-16] // ....................................................................*......
- // ld1 {v24.8H}, [x6], #16 // .....................................................................*.....
- // ldr q19, [x7], #32 // ......................................................................*....
- // ldr q31, [x7, #-16] // .......................................................................*...
- // ldr q17, [x8], #32 // ........................................................................*..
- // ldr q18, [x8, #-16] // .........................................................................*.
- // ld1 {v25.8H}, [x9], #16 // ..........................................................................*
-
- sub count, count, #2
-1:
- // Instructions: 65
- // Expected cycles: 80
- // Expected IPC: 0.81
-
- // Cycle bound: 80.0
- // IPC bound: 0.81
-
- // Wall time: 11.64s
- // User time: 11.64s
-
- // ---------------------- original position ----------------------->
- // 0 25 50
- // |------------------------|------------------------|--------------
- ldr q20, [x1], #32 // *................................................................
- uzp1 v7.8H, v15.8H, v16.8H // ......*..........................................................
- uzp2 v15.8H, v15.8H, v16.8H // .......*.........................................................
- uzp1 v8.8H, v20.8H, v30.8H // ..*..............................................................
- uzp2 v20.8H, v20.8H, v30.8H // ...*.............................................................
- smull v30.4S, v8.4H, v15.4H // .............*...................................................
- smull2 v15.4S, v8.8H, v15.8H // ..............*..................................................
- smull v11.4S, v8.4H, v7.4H // .........*.......................................................
- smull2 v8.4S, v8.8H, v7.8H // ..........*......................................................
- smlal v30.4S, v20.4H, v7.4H // ...............*.................................................
- smlal2 v15.4S, v20.8H, v7.8H // ................*................................................
- smlal v11.4S, v20.4H, v9.4H // ...........*.....................................................
- smlal2 v8.4S, v20.8H, v9.8H // ............*....................................................
- uzp1 v7.8H, v21.8H, v6.8H // ...................*.............................................
- uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................
- uzp1 v16.8H, v1.8H, v12.8H // .......................*.........................................
- uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................
- smlal v11.4S, v7.4H, v16.4H // ..........................*......................................
- smlal2 v8.4S, v7.8H, v16.8H // ...........................*.....................................
- smlal v30.4S, v7.4H, v9.4H // ..............................*..................................
- smlal2 v15.4S, v7.8H, v9.8H // ...............................*.................................
- smlal v11.4S, v20.4H, v24.4H // ............................*....................................
- smlal2 v8.4S, v20.8H, v24.8H // .............................*...................................
- smlal v30.4S, v20.4H, v16.4H // ................................*................................
- smlal2 v15.4S, v20.8H, v16.8H // .................................*...............................
- uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................
- uzp2 v20.8H, v19.8H, v31.8H // .....................................*...........................
- uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................
- uzp2 v9.8H, v17.8H, v18.8H // .........................................*.......................
- smlal v11.4S, v7.4H, v16.4H // ...........................................*.....................
- smlal2 v8.4S, v7.8H, v16.8H // ............................................*....................
- smlal v30.4S, v7.4H, v9.4H // ...............................................*.................
- smlal2 v15.4S, v7.8H, v9.8H // ................................................*................
- smlal v11.4S, v20.4H, v25.4H // .............................................*...................
- smlal2 v8.4S, v20.8H, v25.8H // ..............................................*..................
- smlal v30.4S, v20.4H, v16.4H // .................................................*...............
- smlal2 v15.4S, v20.8H, v16.8H // ..................................................*..............
- ldr q16, [x2, #16] // .....e...........................................................
- uzp1 v7.8H, v11.8H, v8.8H // ...................................................*.............
- uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........
- mul v7.8H, v7.8H, v2.8H // ....................................................*............
- mul v20.8H, v20.8H, v2.8H // .........................................................*.......
- zip2 v9.8H, v27.8H, v10.8H // ..............................................................l..
- zip1 v27.8H, v27.8H, v10.8H // .............................................................l...
- smlal v11.4S, v7.4H, v0.4H // .....................................................*...........
- smlal2 v8.4S, v7.8H, v0.8H // ......................................................*..........
- smlal v30.4S, v20.4H, v0.4H // ..........................................................*......
- smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*.....
- str q27, [x0], #32 // ...............................................................l.
- uzp2 v27.8H, v11.8H, v8.8H // .......................................................*.........
- str q9, [x0, #-16] // ................................................................l
- uzp2 v10.8H, v30.8H, v15.8H // ............................................................*....
- ldr q30, [x1, #16] // .e...............................................................
- ldr q15, [x2], #32 // ....e............................................................
- ld1 {v9.8H}, [x3], #16 // ........e........................................................
- ldr q21, [x4], #32 // .................e...............................................
- ldr q6, [x4, #-16] // ..................e..............................................
- ldr q1, [x5], #32 // .....................e...........................................
- ldr q12, [x5, #-16] // ......................e..........................................
- ld1 {v24.8H}, [x6], #16 // .........................e.......................................
- ldr q19, [x7], #32 // ..................................e..............................
- ldr q31, [x7, #-16] // ...................................e.............................
- ldr q17, [x8], #32 // ......................................e..........................
- ldr q18, [x8, #-16] // .......................................e.........................
- ld1 {v25.8H}, [x9], #16 // ..........................................e......................
-
- // ---------------------------------------------------------------- new position ----------------------------------------------------------------->
- // 0 25 50 75 100 125
- // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------
- // ldr q12, [x1], #32 // ............................*................................................................~..................................................
- // ldr q13, [x1, #-16] // ...............e............'...................................................~............'..................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~...............................................
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~..............................................
- // ldr q12, [x2], #32 // ................e...........'....................................................~...........'..................................................
- // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~.............
- // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~.................................................
- // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................
- // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'..................................................
- // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~...........................................
- // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~..........................................
- // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~.......................................
- // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~......................................
- // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~.............................................
- // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................
- // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~.........................................
- // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................
- // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'..................................................
- // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'..................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~.....................................
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~....................................
- // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'..................................................
- // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'..................................................
- // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~...................................
- // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~..................................
- // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'..................................................
- // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~.................................
- // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................
- // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~.............................
- // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................
- // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~...............................
- // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~..............................
- // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~...........................
- // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~..........................
- // ldr q12, [x7], #32 // .......................e....'...........................................................~....'..................................................
- // ldr q13, [x7, #-16] // ........................e...'............................................................~...'..................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~.........................
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................
- // ldr q12, [x8], #32 // .........................e..'.............................................................~..'..................................................
- // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'..................................................
- // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~.......................
- // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~......................
- // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'..................................................
- // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~.....................
- // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~....................
- // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~.................
- // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................
- // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~...................
- // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~..................
- // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~...............
- // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~..............
- // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............
- // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~..........
- // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~......
- // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~.....
- // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~.
- // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~...........
- // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~.........
- // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~....
- // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~...
- // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'..................................................
- // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l.......
- // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........
- // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l..
- // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l
-
- sub count, count, #1
- cbnz count, 1b
- // Instructions: 55
- // Expected cycles: 61
- // Expected IPC: 0.90
-
- // Cycle bound: 61.0
- // IPC bound: 0.90
-
- // Wall time: 8.41s
- // User time: 8.41s
-
- // ----------------- original position ------------------>
- // 0 25 50
- // |------------------------|------------------------|----
- ldr q7, [x1], #32 // *......................................................
- uzp1 v20.8H, v15.8H, v16.8H // .*.....................................................
- uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
- uzp1 v23.8H, v7.8H, v30.8H // ...*...................................................
- uzp2 v11.8H, v7.8H, v30.8H // ....*..................................................
- smull2 v8.4S, v23.8H, v20.8H // ........*..............................................
- smull v5.4S, v23.4H, v20.4H // .......*...............................................
- smull2 v30.4S, v23.8H, v15.8H // ......*................................................
- uzp1 v28.8H, v1.8H, v12.8H // ...............*.......................................
- smlal2 v8.4S, v11.8H, v9.8H // ............*..........................................
- smlal v5.4S, v11.4H, v9.4H // ...........*...........................................
- uzp1 v3.8H, v21.8H, v6.8H // .............*.........................................
- smull v16.4S, v23.4H, v15.4H // .....*.................................................
- smlal2 v8.4S, v3.8H, v28.8H // ..................*....................................
- smlal v5.4S, v3.4H, v28.4H // .................*.....................................
- uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................
- uzp1 v7.8H, v17.8H, v18.8H // ...........................*...........................
- smlal2 v8.4S, v29.8H, v24.8H // ......................*................................
- uzp1 v14.8H, v19.8H, v31.8H // .........................*.............................
- smlal v16.4S, v11.4H, v20.4H // .........*.............................................
- smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................
- smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................
- uzp2 v20.8H, v1.8H, v12.8H // ................*......................................
- uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................
- smlal2 v30.4S, v3.8H, v20.8H // ....................*..................................
- smlal v16.4S, v3.4H, v20.4H // ...................*...................................
- smlal v5.4S, v29.4H, v24.4H // .....................*.................................
- uzp2 v9.8H, v17.8H, v18.8H // ............................*..........................
- smlal2 v30.4S, v29.8H, v28.8H // ........................*..............................
- smlal v16.4S, v29.4H, v28.4H // .......................*...............................
- smlal v5.4S, v14.4H, v7.4H // .............................*.........................
- smlal2 v8.4S, v21.8H, v25.8H // ..................................*....................
- smlal2 v30.4S, v14.8H, v9.8H // ................................*......................
- smlal v16.4S, v14.4H, v9.4H // ...............................*.......................
- smlal v5.4S, v21.4H, v25.4H // .................................*.....................
- zip1 v20.8H, v27.8H, v10.8H // ..........................................*............
- smlal2 v30.4S, v21.8H, v7.8H // ....................................*..................
- smlal v16.4S, v21.4H, v7.4H // ...................................*...................
- uzp1 v7.8H, v5.8H, v8.8H // .....................................*.................
- str q20, [x0], #32 // ...............................................*.......
- mul v15.8H, v7.8H, v2.8H // .......................................*...............
- uzp1 v7.8H, v16.8H, v30.8H // ......................................*................
- zip2 v31.8H, v27.8H, v10.8H // .........................................*.............
- mul v20.8H, v7.8H, v2.8H // ........................................*..............
- smlal v5.4S, v15.4H, v0.4H // ...........................................*...........
- smlal2 v8.4S, v15.8H, v0.8H // ............................................*..........
- str q31, [x0, #-16] // .................................................*.....
- smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........
- smlal v16.4S, v20.4H, v0.4H // .............................................*.........
- uzp2 v15.8H, v5.8H, v8.8H // ................................................*......
- uzp2 v20.8H, v16.8H, v30.8H // ..................................................*....
- zip1 v7.8H, v15.8H, v20.8H // ....................................................*..
- zip2 v20.8H, v15.8H, v20.8H // ...................................................*...
- str q7, [x0], #32 // .....................................................*.
- str q20, [x0, #-16] // ......................................................*
-
- // -------------------- new position -------------------->
- // 0 25 50
- // |------------------------|------------------------|----
- // ldr q20, [x1], #32 // *......................................................
- // uzp1 v7.8H, v15.8H, v16.8H // .*.....................................................
- // uzp2 v15.8H, v15.8H, v16.8H // ..*....................................................
- // uzp1 v8.8H, v20.8H, v30.8H // ...*...................................................
- // uzp2 v20.8H, v20.8H, v30.8H // ....*..................................................
- // smull v30.4S, v8.4H, v15.4H // ............*..........................................
- // smull2 v15.4S, v8.8H, v15.8H // .......*...............................................
- // smull v11.4S, v8.4H, v7.4H // ......*................................................
- // smull2 v8.4S, v8.8H, v7.8H // .....*.................................................
- // smlal v30.4S, v20.4H, v7.4H // ...................*...................................
- // smlal2 v15.4S, v20.8H, v7.8H // ....................*..................................
- // smlal v11.4S, v20.4H, v9.4H // ..........*............................................
- // smlal2 v8.4S, v20.8H, v9.8H // .........*.............................................
- // uzp1 v7.8H, v21.8H, v6.8H // ...........*...........................................
- // uzp2 v20.8H, v21.8H, v6.8H // ...............*.......................................
- // uzp1 v16.8H, v1.8H, v12.8H // ........*..............................................
- // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................
- // smlal v11.4S, v7.4H, v16.4H // ..............*........................................
- // smlal2 v8.4S, v7.8H, v16.8H // .............*.........................................
- // smlal v30.4S, v7.4H, v9.4H // .........................*.............................
- // smlal2 v15.4S, v7.8H, v9.8H // ........................*..............................
- // smlal v11.4S, v20.4H, v24.4H // ..........................*............................
- // smlal2 v8.4S, v20.8H, v24.8H // .................*.....................................
- // smlal v30.4S, v20.4H, v16.4H // .............................*.........................
- // smlal2 v15.4S, v20.8H, v16.8H // ............................*..........................
- // uzp1 v7.8H, v19.8H, v31.8H // ..................*....................................
- // uzp2 v20.8H, v19.8H, v31.8H // .......................*...............................
- // uzp1 v16.8H, v17.8H, v18.8H // ................*......................................
- // uzp2 v9.8H, v17.8H, v18.8H // ...........................*...........................
- // smlal v11.4S, v7.4H, v16.4H // ..............................*........................
- // smlal2 v8.4S, v7.8H, v16.8H // .....................*.................................
- // smlal v30.4S, v7.4H, v9.4H // .................................*.....................
- // smlal2 v15.4S, v7.8H, v9.8H // ................................*......................
- // smlal v11.4S, v20.4H, v25.4H // ..................................*....................
- // smlal2 v8.4S, v20.8H, v25.8H // ...............................*.......................
- // smlal v30.4S, v20.4H, v16.4H // .....................................*.................
- // smlal2 v15.4S, v20.8H, v16.8H // ....................................*..................
- // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................
- // uzp1 v20.8H, v30.8H, v15.8H // .........................................*.............
- // mul v7.8H, v7.8H, v2.8H // ........................................*..............
- // mul v20.8H, v20.8H, v2.8H // ...........................................*...........
- // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............
- // zip1 v27.8H, v27.8H, v10.8H // ...................................*...................
- // smlal v11.4S, v7.4H, v0.4H // ............................................*..........
- // smlal2 v8.4S, v7.8H, v0.8H // .............................................*.........
- // smlal v30.4S, v20.4H, v0.4H // ................................................*......
- // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*.......
- // str q27, [x0], #32 // .......................................*...............
- // uzp2 v27.8H, v11.8H, v8.8H // .................................................*.....
- // str q9, [x0, #-16] // ..............................................*........
- // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*....
- // zip2 v9.8H, v27.8H, v10.8H // ....................................................*..
- // zip1 v27.8H, v27.8H, v10.8H // ...................................................*...
- // str q27, [x0], #32 // .....................................................*.
- // str q9, [x0, #-16] // ......................................................*
-
-
- pop_stack
- ret
-#endif /* MLKEM_K == 3 */
-
-#if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
-
-.balign 4
-MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
- push_stack
- mov wtmp, #3329
- dup modulus.8h, wtmp
-
- mov wtmp, #3327
- dup modulus_twisted.8h, wtmp
-
- // Computed bases of vector entries
-
- add a1_ptr, a0_ptr, #(1 * 512)
- add b1_ptr, b0_ptr, #(1 * 512)
- add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2)
- add a2_ptr, a0_ptr, #(2 * 512)
- add b2_ptr, b0_ptr, #(2 * 512)
- add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2)
- add a3_ptr, a0_ptr, #(3 * 512)
- add b3_ptr, b0_ptr, #(3 * 512)
- add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2)
-
- // Bounds:
-
- // Each pmull is bound by 2*4096*2^15=2^28, so the final value
- // before Montgomery reduction is bound by 2^30.
-
- mov count, #(MLKEM_N / 16)
- // Instructions: 114
- // Expected cycles: 153
- // Expected IPC: 0.75
- //
- // Cycle bound: 153.0
- // IPC bound: 0.75
- //
- // Wall time: 0.69s
- // User time: 0.69s
- //
- // ----------------------------------------------- original position ----------------------------------------------->
- // 0 25 50 75 100
- // |------------------------|------------------------|------------------------|------------------------|-------------
- ldr q23, [x2, #16] // .*................................................................................................................
- ldr q19, [x2], #32 // *.................................................................................................................
- ldr q17, [x5], #32 // ..*...............................................................................................................
- uzp2 v13.8H, v19.8H, v23.8H // ..........*.......................................................................................................
- uzp1 v19.8H, v19.8H, v23.8H // ...........*......................................................................................................
- ldr q23, [x5, #-16] // ...*..............................................................................................................
- ldr q30, [x1, #16] // .....*............................................................................................................
- uzp2 v9.8H, v17.8H, v23.8H // ....*.............................................................................................................
- uzp1 v23.8H, v17.8H, v23.8H // .......*..........................................................................................................
- ldr q17, [x1], #32 // ......*...........................................................................................................
- ldr q10, [x7, #16] // .............*....................................................................................................
- uzp1 v12.8H, v17.8H, v30.8H // ........*.........................................................................................................
- uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................
- smull2 v30.4S, v12.8H, v13.8H // ............*.....................................................................................................
- smull v13.4S, v12.4H, v13.4H // ............................................*.....................................................................
- smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................
- smull v12.4S, v12.4H, v19.4H // ..........................................*.......................................................................
- smlal2 v30.4S, v17.8H, v19.8H // ...............................*..................................................................................
- smlal v13.4S, v17.4H, v19.4H // ...............................................*..................................................................
- ldr q19, [x4], #32 // ....................*.............................................................................................
- ldr q16, [x4, #-16] // .....................*............................................................................................
- ld1 {v8.8H}, [x3], #16 // ................................*.................................................................................
- uzp1 v26.8H, v19.8H, v16.8H // .......................*..........................................................................................
- uzp2 v19.8H, v19.8H, v16.8H // ........................*.........................................................................................
- smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................
- smlal v13.4S, v26.4H, v9.4H // ..................................................*...............................................................
- smlal2 v22.4S, v17.8H, v8.8H // ........................................*.........................................................................
- smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................
- smlal2 v30.4S, v19.8H, v23.8H // ...................................*..............................................................................
- smlal v13.4S, v19.4H, v23.4H // .......................................................*..........................................................
- smlal2 v22.4S, v26.8H, v23.8H // ...........................................*......................................................................
- smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................
- ldr q23, [x7], #32 // ......................*...........................................................................................
- ldr q17, [x8, #16] // ..............*...................................................................................................
- uzp1 v9.8H, v23.8H, v10.8H // ..........................*.......................................................................................
- uzp2 v23.8H, v23.8H, v10.8H // ....................................*.............................................................................
- ldr q10, [x10], #32 // ...............*..................................................................................................
- ldr q16, [x10, #-16] // ................*.................................................................................................
- ld1 {v8.8H}, [x12], #16 // .................*................................................................................................
- uzp1 v26.8H, v10.8H, v16.8H // ..................*...............................................................................................
- uzp2 v10.8H, v10.8H, v16.8H // ...................*..............................................................................................
- ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................
- ldr q3, [x11, #16] // ...........................*......................................................................................
- smlal2 v22.4S, v19.8H, v16.8H // ..............................................*...................................................................
- smlal v12.4S, v19.4H, v16.4H // ........................................................*.........................................................
- ldr q19, [x11], #32 // ............................*.....................................................................................
- ld1 {v16.8H}, [x9], #16 // .............................*....................................................................................
- uzp1 v4.8H, v19.8H, v3.8H // ..................................*...............................................................................
- uzp2 v19.8H, v19.8H, v3.8H // .......................................*..........................................................................
- ldr q3, [x8], #32 // ..............................*...................................................................................
- ldr q31, [x2], #32 // ......................................*...........................................................................
- uzp1 v6.8H, v3.8H, v17.8H // ...................................................*..............................................................
- uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................
- smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*.......................................................
- smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*......................................................
- smlal v13.4S, v9.4H, v17.4H // ............................................................*.....................................................
- smlal v12.4S, v9.4H, v6.4H // .............................................................*....................................................
- smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*...................................................
- smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*..................................................
- smlal v13.4S, v23.4H, v6.4H // ................................................................*.................................................
- smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................
- smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*...............................................
- smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*..............................................
- smlal v13.4S, v26.4H, v19.4H // ....................................................................*.............................................
- smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................
- smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*...........................................
- smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*..........................................
- smlal v13.4S, v10.4H, v4.4H // ........................................................................*.........................................
- smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................
- ldr q19, [x2, #-16] // .........................................*........................................................................
- uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*......................................
- uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*.............................
- mul v23.8H, v23.8H, v2.8H // .............................................................................*....................................
- uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*.................................
- uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*..............................
- mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................
- smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................
- smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*...............................
- ldr q23, [x5], #32 // .............................................*....................................................................
- smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*......
- uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*...........................
- smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*.....
- ldr q17, [x5, #-16] // ................................................*.................................................................
- ldr q13, [x1, #16] // ......................................................*...........................................................
- uzp2 v27.8H, v23.8H, v17.8H // ....................................................*.............................................................
- uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*.....................................
- uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*..
- ldr q23, [x1], #32 // ..........................................................................*.......................................
- zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................*
- ldr q3, [x7, #16] // ........................................................................................*.........................
- uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*...................................
- uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*..................................
- smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*..........................
- ldr q6, [x8, #16] // .........................................................................................*........................
- ldr q23, [x10], #32 // ..........................................................................................*.......................
- smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*.......
- ldr q17, [x10, #-16] // ...........................................................................................*......................
- ld1 {v22.8H}, [x12], #16 // ............................................................................................*.....................
- uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*....................
- uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*...................
- ldr q23, [x4], #32 // ...............................................................................................*..................
- ldr q17, [x4, #-16] // ................................................................................................*.................
- ldr q4, [x7], #32 // .................................................................................................*................
- uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*...............
- uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*..............
- uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............
- smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*...
- ld1 {v8.8H}, [x6], #16 // ....................................................................................................*.............
- ldr q25, [x11, #16] // ......................................................................................................*...........
- ldr q29, [x11], #32 // .......................................................................................................*..........
- ld1 {v12.8H}, [x9], #16 // ........................................................................................................*.........
- uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*.
- ldr q14, [x8], #32 // .........................................................................................................*........
- ld1 {v23.8H}, [x3], #16 // .............................................................................................................*....
-
- // ------------------------------------------------- new position -------------------------------------------------->
- // 0 25 50 75 100
- // |------------------------|------------------------|------------------------|------------------------|-------------
- // ldr q3, [x2], #32 // .*................................................................................................................
- // ldr q17, [x2, #-16] // *.................................................................................................................
- // ldr q21, [x5], #32 // ..*...............................................................................................................
- // ldr q19, [x5, #-16] // .....*............................................................................................................
- // uzp2 v27.8H, v21.8H, v19.8H // .......*..........................................................................................................
- // ldr q25, [x1, #16] // ......*...........................................................................................................
- // ldr q22, [x1], #32 // .........*........................................................................................................
- // uzp1 v28.8H, v21.8H, v19.8H // ........*.........................................................................................................
- // uzp1 v31.8H, v22.8H, v25.8H // ...........*......................................................................................................
- // uzp2 v16.8H, v22.8H, v25.8H // ............*.....................................................................................................
- // uzp2 v21.8H, v3.8H, v17.8H // ...*..............................................................................................................
- // uzp1 v19.8H, v3.8H, v17.8H // ....*.............................................................................................................
- // smull2 v24.4S, v31.8H, v21.8H // .............*....................................................................................................
- // ldr q3, [x7, #16] // ..........*.......................................................................................................
- // ldr q6, [x8, #16] // .................................*................................................................................
- // ldr q8, [x10], #32 // ....................................*.............................................................................
- // ldr q26, [x10, #-16] // .....................................*............................................................................
- // ld1 {v22.8H}, [x12], #16 // ......................................*...........................................................................
- // uzp1 v30.8H, v8.8H, v26.8H // .......................................*..........................................................................
- // uzp2 v11.8H, v8.8H, v26.8H // ........................................*.........................................................................
- // ldr q8, [x4], #32 // ...................*..............................................................................................
- // ldr q26, [x4, #-16] // ....................*.............................................................................................
- // ldr q4, [x7], #32 // ................................*.................................................................................
- // uzp1 v20.8H, v8.8H, v26.8H // ......................*...........................................................................................
- // uzp2 v26.8H, v8.8H, v26.8H // .......................*..........................................................................................
- // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................
- // uzp1 v9.8H, v4.8H, v3.8H // ..................................*...............................................................................
- // ldr q25, [x11, #16] // ..........................................*.......................................................................
- // ldr q29, [x11], #32 // .............................................*....................................................................
- // ld1 {v12.8H}, [x9], #16 // ..............................................*...................................................................
- // ldr q14, [x8], #32 // .................................................*................................................................
- // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................
- // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................
- // smlal2 v24.4S, v20.8H, v27.8H // ........................*.........................................................................................
- // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*..................................................................
- // smlal2 v24.4S, v26.8H, v28.8H // ............................*.....................................................................................
- // uzp2 v4.8H, v4.8H, v3.8H // ...................................*..............................................................................
- // smull2 v13.4S, v31.8H, v19.8H // ...............*..................................................................................................
- // ldr q3, [x2], #32 // ..................................................*...............................................................
- // uzp2 v1.8H, v29.8H, v25.8H // ................................................*.................................................................
- // smlal2 v13.4S, v16.8H, v23.8H // ..........................*.......................................................................................
- // ldr q17, [x2, #-16] // .....................................................................*............................................
- // smull v18.4S, v31.4H, v19.4H // ................*.................................................................................................
- // smlal2 v13.4S, v20.8H, v28.8H // ..............................*...................................................................................
- // smull v29.4S, v31.4H, v21.4H // ..............*...................................................................................................
- // ldr q21, [x5], #32 // ..............................................................................*...................................
- // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*......................................................................
- // smlal v29.4S, v16.4H, v19.4H // ..................*...............................................................................................
- // ldr q19, [x5, #-16] // ..................................................................................*...............................
- // smlal v18.4S, v16.4H, v23.4H // ...........................*......................................................................................
- // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................
- // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*..............................................................
- // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*.............................
- // smlal v18.4S, v20.4H, v28.4H // ...............................*..................................................................................
- // ldr q25, [x1, #16] // ...................................................................................*..............................
- // smlal v29.4S, v26.4H, v28.4H // .............................*....................................................................................
- // smlal v18.4S, v26.4H, v8.4H // ............................................*.....................................................................
- // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*.............................................................
- // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................
- // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*...........................................................
- // smlal v29.4S, v9.4H, v26.4H // .......................................................*..........................................................
- // smlal v18.4S, v9.4H, v31.4H // ........................................................*.........................................................
- // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................
- // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*.......................................................
- // smlal v29.4S, v4.4H, v31.4H // ...........................................................*......................................................
- // smlal v18.4S, v4.4H, v12.4H // ............................................................*.....................................................
- // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................................................
- // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*...................................................
- // smlal v29.4S, v30.4H, v1.4H // ...............................................................*..................................................
- // smlal v18.4S, v30.4H, v10.4H // ................................................................*.................................................
- // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................
- // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*...............................................
- // smlal v29.4S, v11.4H, v10.4H // ...................................................................*..............................................
- // smlal v18.4S, v11.4H, v22.4H // ....................................................................*.............................................
- // ldr q22, [x1], #32 // .......................................................................................*..........................
- // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*...........................................
- // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................
- // mul v19.8H, v31.8H, v2.8H // ........................................................................*.........................................
- // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*.......................
- // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*......................
- // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................
- // smlal v29.4S, v19.4H, v0.4H // ............................................................................*.....................................
- // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*....................................
- // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*.......................................
- // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*..........................................
- // mul v23.8H, v26.8H, v2.8H // ...........................................................................*......................................
- // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*.................................
- // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*.....................
- // ldr q3, [x7, #16] // .........................................................................................*........................
- // ldr q6, [x8, #16] // .............................................................................................*....................
- // ldr q8, [x10], #32 // ..............................................................................................*...................
- // ldr q26, [x10, #-16] // ................................................................................................*.................
- // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................
- // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*...............
- // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*..............
- // ldr q8, [x4], #32 // ....................................................................................................*.............
- // ldr q26, [x4, #-16] // .....................................................................................................*............
- // ldr q4, [x7], #32 // ......................................................................................................*...........
- // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*..........
- // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*.........
- // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*......
- // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........
- // ldr q25, [x11, #16] // ............................................................................................................*.....
- // ldr q29, [x11], #32 // .............................................................................................................*....
- // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*...
- // ldr q14, [x8], #32 // ................................................................................................................*.
- // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*..................
- // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*..................................
- // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................
- // ld1 {v23.8H}, [x3], #16 // .................................................................................................................*
- // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*.......
- // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*...........................
- // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*..
- // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*.........................
-
- sub count, count, #2
-1:
- // Instructions: 82
- // Expected cycles: 102
- // Expected IPC: 0.80
- //
- // Cycle bound: 102.0
- // IPC bound: 0.80
- //
- // Wall time: 15.93s
- // User time: 15.93s
- //
- // ------------------------------- original position ------------------------------->
- // 0 25 50 75
- // |------------------------|------------------------|------------------------|------
- smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................
- uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................
- smull2 v13.4S, v31.8H, v19.8H // ..........*.......................................................................
- ldr q3, [x2], #32 // ....e.............................................................................
- uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*.......................
- smlal2 v13.4S, v16.8H, v23.8H // ............*.....................................................................
- ldr q17, [x2, #-16] // .....e............................................................................
- smull v18.4S, v31.4H, v19.4H // .........*........................................................................
- smlal2 v13.4S, v20.8H, v28.8H // ...........................*......................................................
- smull v29.4S, v31.4H, v21.4H // .............*....................................................................
- ldr q21, [x5], #32 // .....................e............................................................
- smlal2 v13.4S, v26.8H, v8.8H // .............................*....................................................
- smlal v29.4S, v16.4H, v19.4H // ...............*..................................................................
- ldr q19, [x5, #-16] // ......................e...........................................................
- smlal v18.4S, v16.4H, v23.4H // ...........*......................................................................
- smlal v29.4S, v20.4H, v27.4H // ..............................*...................................................
- uzp1 v31.8H, v14.8H, v6.8H // ........................................*.........................................
- uzp2 v27.8H, v21.8H, v19.8H // ........................e.........................................................
- smlal v18.4S, v20.4H, v28.4H // ..........................*.......................................................
- ldr q25, [x1, #16] // .e................................................................................
- smlal v29.4S, v26.4H, v28.4H // ................................*.................................................
- smlal v18.4S, v26.4H, v8.4H // ............................*.....................................................
- uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................
- smlal2 v13.4S, v9.8H, v31.8H // ............................................*.....................................
- smlal2 v24.4S, v9.8H, v26.8H // ................................................*.................................
- smlal v29.4S, v9.4H, v26.4H // ...............................................*..................................
- smlal v18.4S, v9.4H, v31.4H // ...........................................*......................................
- smlal2 v13.4S, v4.8H, v12.8H // ..............................................*...................................
- smlal2 v24.4S, v4.8H, v31.8H // ..................................................*...............................
- smlal v29.4S, v4.4H, v31.4H // .................................................*................................
- smlal v18.4S, v4.4H, v12.4H // .............................................*....................................
- smlal2 v13.4S, v30.8H, v10.8H // .............................................................*....................
- smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................
- smlal v29.4S, v30.4H, v1.4H // ................................................................*.................
- smlal v18.4S, v30.4H, v10.4H // ............................................................*.....................
- smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*..................
- smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*..............
- smlal v29.4S, v11.4H, v10.4H // ..................................................................*...............
- smlal v18.4S, v11.4H, v22.4H // ..............................................................*...................
- ldr q22, [x1], #32 // e.................................................................................
- uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........
- uzp1 v28.8H, v21.8H, v19.8H // .......................e..........................................................
- mul v19.8H, v31.8H, v2.8H // ..........................................................................*.......
- uzp1 v31.8H, v22.8H, v25.8H // ..e...............................................................................
- uzp2 v16.8H, v22.8H, v25.8H // ...e..............................................................................
- uzp2 v21.8H, v3.8H, v17.8H // .......e..........................................................................
- smlal v29.4S, v19.4H, v0.4H // ...........................................................................*......
- smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*.....
- uzp1 v19.8H, v3.8H, v17.8H // ......e...........................................................................
- uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*.............
- zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l..
- mul v23.8H, v26.8H, v2.8H // .....................................................................*............
- uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*....
- smull2 v24.4S, v31.8H, v21.8H // ..............e...................................................................
- str q14, [x0, #16] // .................................................................................l
- ldr q3, [x7, #16] // ...................................e..............................................
- ldr q6, [x8, #16] // .......................................e..........................................
- ldr q8, [x10], #32 // ...................................................e..............................
- ldr q26, [x10, #-16] // ....................................................e.............................
- ld1 {v22.8H}, [x12], #16 // ...........................................................e......................
- uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................
- uzp2 v11.8H, v8.8H, v26.8H // ......................................................e...........................
- ldr q8, [x4], #32 // .................e................................................................
- ldr q26, [x4, #-16] // ..................e...............................................................
- ldr q4, [x7], #32 // ..................................e...............................................
- uzp1 v20.8H, v8.8H, v26.8H // ...................e..............................................................
- uzp2 v26.8H, v8.8H, v26.8H // ....................e.............................................................
- ld1 {v8.8H}, [x6], #16 // .........................e........................................................
- uzp1 v9.8H, v4.8H, v3.8H // ....................................e.............................................
- ldr q25, [x11, #16] // ........................................................e.........................
- ldr q29, [x11], #32 // .......................................................e..........................
- ld1 {v12.8H}, [x9], #16 // ..........................................e.......................................
- ldr q14, [x8], #32 // ......................................e...........................................
- smlal2 v24.4S, v16.8H, v19.8H // ................e.................................................................
- smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*..........
- smlal v18.4S, v23.4H, v0.4H // ......................................................................*...........
- ld1 {v23.8H}, [x3], #16 // ........e.........................................................................
- smlal2 v24.4S, v20.8H, v27.8H // ...............................e..................................................
- uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*.........
- uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................
- str q5, [x0], #32 // ................................................................................l.
- zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*...
-
- // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------>
- // 0 25 50 75 100 125 150 175 200 225
- // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------
- // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~.........................................
- // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~.............................................................
- // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~.....................................
- // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~....................................
- // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~.............................................................................
- // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~..........................................................................
- // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................
- // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~...................................
- // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~....
- // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~.........................................................................
- // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~..............................................................................
- // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~..................................................................
- // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~...........................................................................
- // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~.......................................................................
- // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~...........................
- // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~....................................................................
- // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~.......
- // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~..................
- // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~.................
- // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~...............
- // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~..............
- // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~......................................................................
- // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~...................................................................
- // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~.......................................
- // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~...............................................................
- // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~.............
- // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~..............................................................
- // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................
- // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~...........................................................
- // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~.....................................................................
- // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~.................................................................
- // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~...
- // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................
- // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................
- // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................
- // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~.........................
- // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............
- // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~...............................................................................
- // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........
- // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................
- // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................
- // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~..........................................................
- // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~.........
- // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~......................................................
- // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~.........................................................
- // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~..................................................
- // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~.....................................................
- // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~.......................................................
- // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................
- // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~...................................................
- // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~....................................................
- // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~.......................
- // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~......................
- // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~....................
- // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~...................
- // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~..........
- // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~...........
- // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~.
- // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................
- // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~.....................
- // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~..............................................
- // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~.................................................
- // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~..........................................
- // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~.............................................
- // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~...............................................
- // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................
- // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~...........................................
- // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................
- // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~...............................
- // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~.............................
- // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~.....
- // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~......
- // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~..
- // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................
- // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~......................................
- // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~..................................
- // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~.................................
- // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................
- // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................
- // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l..............................
- // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l
- // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l..........................
-
- sub count, count, #1
- cbnz count, 1b
- // Instructions: 50
- // Expected cycles: 56
- // Expected IPC: 0.89
- //
- // Cycle bound: 56.0
- // IPC bound: 0.89
- //
- // Wall time: 4.16s
- // User time: 4.16s
- //
- // --------------- original position --------------->
- // 0 25
- // |------------------------|
- smull2 v17.4S, v31.8H, v19.8H // ..*...............................................
- uzp2 v1.8H, v14.8H, v6.8H // ................*.................................
- smull v18.4S, v31.4H, v21.4H // .......*..........................................
- smlal2 v24.4S, v26.8H, v28.8H // *.................................................
- smlal2 v17.4S, v16.8H, v23.8H // ....*.............................................
- smull v21.4S, v31.4H, v19.4H // .....*............................................
- smlal v18.4S, v16.4H, v19.4H // .........*........................................
- uzp2 v31.8H, v4.8H, v3.8H // .*................................................
- uzp1 v3.8H, v14.8H, v6.8H // ............*.....................................
- smlal v21.4S, v16.4H, v23.4H // ..........*.......................................
- smlal v18.4S, v20.4H, v27.4H // ...........*......................................
- uzp2 v14.8H, v29.8H, v25.8H // ...*..............................................
- smlal2 v17.4S, v20.8H, v28.8H // ......*...........................................
- smlal v21.4S, v20.4H, v28.4H // .............*....................................
- smlal v18.4S, v26.4H, v28.4H // ..............*...................................
- smlal2 v24.4S, v9.8H, v1.8H // ..................*...............................
- smlal2 v17.4S, v26.8H, v8.8H // ........*.........................................
- smlal v21.4S, v26.4H, v8.4H // ...............*..................................
- smlal v18.4S, v9.4H, v1.4H // ...................*..............................
- smlal2 v24.4S, v31.8H, v3.8H // ......................*...........................
- smlal2 v17.4S, v9.8H, v3.8H // .................*................................
- smlal v21.4S, v9.4H, v3.4H // ....................*.............................
- smlal v18.4S, v31.4H, v3.4H // .......................*..........................
- smlal2 v24.4S, v30.8H, v14.8H // ..........................*.......................
- smlal2 v17.4S, v31.8H, v12.8H // .....................*............................
- smlal v21.4S, v31.4H, v12.4H // ........................*.........................
- smlal v18.4S, v30.4H, v14.4H // ...........................*......................
- smlal2 v24.4S, v11.8H, v10.8H // ..............................*...................
- smlal2 v17.4S, v30.8H, v10.8H // .........................*........................
- smlal v21.4S, v30.4H, v10.4H // ............................*.....................
- smlal v18.4S, v11.4H, v10.4H // ...............................*..................
- zip2 v19.8H, v7.8H, v15.8H // ......................................*...........
- smlal2 v17.4S, v11.8H, v22.8H // .............................*....................
- smlal v21.4S, v11.4H, v22.4H // ................................*.................
- uzp1 v23.8H, v18.8H, v24.8H // .................................*................
- str q19, [x0, #16] // .........................................*........
- mul v19.8H, v23.8H, v2.8H // ..................................*...............
- uzp1 v23.8H, v21.8H, v17.8H // .....................................*............
- str q5, [x0], #32 // .............................................*....
- mul v26.8H, v23.8H, v2.8H // .......................................*..........
- smlal v18.4S, v19.4H, v0.4H // ...................................*..............
- smlal2 v24.4S, v19.8H, v0.8H // ....................................*.............
- smlal v21.4S, v26.4H, v0.4H // ...........................................*......
- smlal2 v17.4S, v26.8H, v0.8H // ..........................................*.......
- uzp2 v13.8H, v18.8H, v24.8H // ........................................*.........
- uzp2 v19.8H, v21.8H, v17.8H // ............................................*.....
- zip1 v23.8H, v19.8H, v13.8H // ..............................................*...
- zip2 v19.8H, v19.8H, v13.8H // ...............................................*..
- str q23, [x0], #32 // .................................................*
- str q19, [x0, #-16] // ................................................*.
-
- // ----------------- new position ------------------>
- // 0 25
- // |------------------------|------------------------
- // smlal2 v24.4S, v26.8H, v28.8H // ...*..............................................
- // uzp2 v4.8H, v4.8H, v3.8H // .......*..........................................
- // smull2 v13.4S, v31.8H, v19.8H // *.................................................
- // uzp2 v1.8H, v29.8H, v25.8H // ...........*......................................
- // smlal2 v13.4S, v16.8H, v23.8H // ....*.............................................
- // smull v18.4S, v31.4H, v19.4H // .....*............................................
- // smlal2 v13.4S, v20.8H, v28.8H // ............*.....................................
- // smull v29.4S, v31.4H, v21.4H // ..*...............................................
- // smlal2 v13.4S, v26.8H, v8.8H // ................*.................................
- // smlal v29.4S, v16.4H, v19.4H // ......*...........................................
- // smlal v18.4S, v16.4H, v23.4H // .........*........................................
- // smlal v29.4S, v20.4H, v27.4H // ..........*.......................................
- // uzp1 v31.8H, v14.8H, v6.8H // ........*.........................................
- // smlal v18.4S, v20.4H, v28.4H // .............*....................................
- // smlal v29.4S, v26.4H, v28.4H // ..............*...................................
- // smlal v18.4S, v26.4H, v8.4H // .................*................................
- // uzp2 v26.8H, v14.8H, v6.8H // .*................................................
- // smlal2 v13.4S, v9.8H, v31.8H // ....................*.............................
- // smlal2 v24.4S, v9.8H, v26.8H // ...............*..................................
- // smlal v29.4S, v9.4H, v26.4H // ..................*...............................
- // smlal v18.4S, v9.4H, v31.4H // .....................*............................
- // smlal2 v13.4S, v4.8H, v12.8H // ........................*.........................
- // smlal2 v24.4S, v4.8H, v31.8H // ...................*..............................
- // smlal v29.4S, v4.4H, v31.4H // ......................*...........................
- // smlal v18.4S, v4.4H, v12.4H // .........................*........................
- // smlal2 v13.4S, v30.8H, v10.8H // ............................*.....................
- // smlal2 v24.4S, v30.8H, v1.8H // .......................*..........................
- // smlal v29.4S, v30.4H, v1.4H // ..........................*.......................
- // smlal v18.4S, v30.4H, v10.4H // .............................*....................
- // smlal2 v13.4S, v11.8H, v22.8H // ................................*.................
- // smlal2 v24.4S, v11.8H, v10.8H // ...........................*......................
- // smlal v29.4S, v11.4H, v10.4H // ..............................*...................
- // smlal v18.4S, v11.4H, v22.4H // .................................*................
- // uzp1 v31.8H, v29.8H, v24.8H // ..................................*...............
- // mul v19.8H, v31.8H, v2.8H // ....................................*.............
- // smlal v29.4S, v19.4H, v0.4H // ........................................*.........
- // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........
- // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............
- // zip2 v14.8H, v7.8H, v15.8H // ...............................*..................
- // mul v23.8H, v26.8H, v2.8H // .......................................*..........
- // uzp2 v15.8H, v29.8H, v24.8H // ............................................*.....
- // str q14, [x0, #16] // ...................................*..............
- // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*......
- // smlal v18.4S, v23.4H, v0.4H // ..........................................*.......
- // uzp2 v7.8H, v18.8H, v13.8H // .............................................*....
- // str q5, [x0], #32 // ......................................*...........
- // zip1 v5.8H, v7.8H, v15.8H // ..............................................*...
- // zip2 v14.8H, v7.8H, v15.8H // ...............................................*..
- // str q14, [x0, #16] // .................................................*
- // str q5, [x0], #32 // ................................................*.
-
-
- pop_stack
- ret
-#endif /* MLKEM_K == 4 */
-
-/****************** REGISTER DEALLOCATIONS *******************/
- .unreq out
- .unreq a0_ptr
- .unreq b0_ptr
- .unreq b0_cache_ptr
- .unreq a1_ptr
- .unreq b1_ptr
- .unreq b1_cache_ptr
- .unreq a2_ptr
- .unreq b2_ptr
- .unreq b2_cache_ptr
- .unreq a3_ptr
- .unreq b3_ptr
- .unreq b3_cache_ptr
- .unreq count
- .unreq modulus
- .unreq modulus_twisted
- .unreq wtmp
- .unreq aa0
- .unreq aa1
- .unreq bb0
- .unreq bb1
- .unreq bb1t
- .unreq res0l
- .unreq res1l
- .unreq res0h
- .unreq res1h
- .unreq tmp0
- .unreq tmp1
- .unreq q_tmp0
- .unreq q_tmp1
- .unreq out0
- .unreq out1
- .unreq t0
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
index 8302d2a3e..f2451815a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
@@ -19,8 +19,8 @@
* Returns number of sampled 16-bit integers (at most MLKEM_N).
**************************************************/
#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+/* simpasm: header-end */
// We save the output on the stack first, and copy to the actual
// output buffer only in the end. This is because the main loop can overwrite
@@ -112,9 +112,9 @@
mlkem_q .req v30
bits .req v31
-.text
-.global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean)
-.balign 4
+ .text
+ .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean)
+ .balign 4
MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean):
push_stack
@@ -402,5 +402,5 @@ return:
.unreq mlkem_q
.unreq bits
-#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) ||
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */
+/* simpasm: footer-start */
+#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c
index becdf303b..592c15fb0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c
@@ -10,8 +10,7 @@
#include "../../../common.h"
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
- defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
#include
#include "arith_native_aarch64.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h
index f9fe4310a..df43dc5b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h
@@ -11,21 +11,10 @@
#include "../sys.h"
#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
#include "aarch64/opt.h"
#endif /* SYS_AARCH64 */
#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
#include "x86_64/default.h"
#endif /* SYS_X86_64 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h
index f9fe4310a..df43dc5b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h
@@ -11,21 +11,10 @@
#include "../sys.h"
#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
#include "aarch64/opt.h"
#endif /* SYS_AARCH64 */
#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
#include "x86_64/default.h"
#endif /* SYS_X86_64 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h
index f9fe4310a..df43dc5b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h
@@ -11,21 +11,10 @@
#include "../sys.h"
#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
#include "aarch64/opt.h"
#endif /* SYS_AARCH64 */
#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
#include "x86_64/default.h"
#endif /* SYS_X86_64 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S
index 5fdc3d0a0..3063d20ae 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S
@@ -8,6 +8,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
#include "consts.h"
@@ -113,6 +114,7 @@ vmovdqa %ymm11,(64*\off+48)*2(%rdi)
.text
.global MLKEM_ASM_NAMESPACE(basemul_avx2)
+.balign 4
MLKEM_ASM_NAMESPACE(basemul_avx2):
mov %rsp,%r8
and $-32,%rsp
@@ -133,4 +135,5 @@ schoolbook 3
mov %r8,%rsp
ret
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S
index 7b1f22624..e74199930 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S
@@ -12,6 +12,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
#include "consts.h"
#include "shuffle.inc"
@@ -242,6 +243,7 @@ vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.text
.global MLKEM_ASM_NAMESPACE(invntt_avx2)
+.balign 4
MLKEM_ASM_NAMESPACE(invntt_avx2):
vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
@@ -252,4 +254,5 @@ intt_level6 0
intt_level6 1
ret
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S
index 5d928b4cc..70582fbc1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S
@@ -8,6 +8,7 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
#include "consts.h"
#include "shuffle.inc"
@@ -205,6 +206,7 @@ vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.text
.global MLKEM_ASM_NAMESPACE(ntt_avx2)
+.balign 4
MLKEM_ASM_NAMESPACE(ntt_avx2):
vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
@@ -216,4 +218,5 @@ levels1t6 1
ret
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttfrombytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttfrombytes.S
new file mode 100644
index 000000000..71f2af000
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttfrombytes.S
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(nttfrombytes_avx2):
+#consts
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0
+call nttfrombytes128_avx
+add $256,%rdi
+add $192,%rsi
+call nttfrombytes128_avx
+ret
+
+nttfrombytes128_avx:
+#load
+vmovdqu (%rsi),%ymm4
+vmovdqu 32(%rsi),%ymm5
+vmovdqu 64(%rsi),%ymm6
+vmovdqu 96(%rsi),%ymm7
+vmovdqu 128(%rsi),%ymm8
+vmovdqu 160(%rsi),%ymm9
+
+shuffle8 4,7,3,7
+shuffle8 5,8,4,8
+shuffle8 6,9,5,9
+
+shuffle4 3,8,6,8
+shuffle4 7,5,3,5
+shuffle4 4,9,7,9
+
+shuffle2 6,5,4,5
+shuffle2 8,7,6,7
+shuffle2 3,9,8,9
+
+shuffle1 4,7,10,7
+shuffle1 5,8,4,8
+shuffle1 6,9,5,9
+
+#bitunpack
+vpsrlw $12,%ymm10,%ymm11
+vpsllw $4,%ymm7,%ymm12
+vpor %ymm11,%ymm12,%ymm11
+vpand %ymm0,%ymm10,%ymm10
+vpand %ymm0,%ymm11,%ymm11
+
+vpsrlw $8,%ymm7,%ymm12
+vpsllw $8,%ymm4,%ymm13
+vpor %ymm12,%ymm13,%ymm12
+vpand %ymm0,%ymm12,%ymm12
+
+vpsrlw $4,%ymm4,%ymm13
+vpand %ymm0,%ymm13,%ymm13
+
+vpsrlw $12,%ymm8,%ymm14
+vpsllw $4,%ymm5,%ymm15
+vpor %ymm14,%ymm15,%ymm14
+vpand %ymm0,%ymm8,%ymm8
+vpand %ymm0,%ymm14,%ymm14
+
+vpsrlw $8,%ymm5,%ymm15
+vpsllw $8,%ymm9,%ymm1
+vpor %ymm15,%ymm1,%ymm15
+vpand %ymm0,%ymm15,%ymm15
+
+vpsrlw $4,%ymm9,%ymm1
+vpand %ymm0,%ymm1,%ymm1
+
+#store
+vmovdqa %ymm10,(%rdi)
+vmovdqa %ymm11,32(%rdi)
+vmovdqa %ymm12,64(%rdi)
+vmovdqa %ymm13,96(%rdi)
+vmovdqa %ymm8,128(%rdi)
+vmovdqa %ymm14,160(%rdi)
+vmovdqa %ymm15,192(%rdi)
+vmovdqa %ymm1,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttpack.S
new file mode 100644
index 000000000..4c10ef366
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttpack.S
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(nttpack_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(nttpack_avx2):
+#load
+vmovdqa (%rdi),%ymm4
+vmovdqa 32(%rdi),%ymm5
+vmovdqa 64(%rdi),%ymm6
+vmovdqa 96(%rdi),%ymm7
+vmovdqa 128(%rdi),%ymm8
+vmovdqa 160(%rdi),%ymm9
+vmovdqa 192(%rdi),%ymm10
+vmovdqa 224(%rdi),%ymm11
+
+shuffle1 4,5,3,5
+shuffle1 6,7,4,7
+shuffle1 8,9,6,9
+shuffle1 10,11,8,11
+
+shuffle2 3,4,10,4
+shuffle2 6,8,3,8
+shuffle2 5,7,6,7
+shuffle2 9,11,5,11
+
+shuffle4 10,3,9,3
+shuffle4 6,5,10,5
+shuffle4 4,8,6,8
+shuffle4 7,11,4,11
+
+shuffle8 9,10,7,10
+shuffle8 6,4,9,4
+shuffle8 3,5,6,5
+shuffle8 8,11,3,11
+
+#store
+vmovdqa %ymm7,(%rdi)
+vmovdqa %ymm9,32(%rdi)
+vmovdqa %ymm6,64(%rdi)
+vmovdqa %ymm3,96(%rdi)
+vmovdqa %ymm10,128(%rdi)
+vmovdqa %ymm4,160(%rdi)
+vmovdqa %ymm5,192(%rdi)
+vmovdqa %ymm11,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntttobytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntttobytes.S
new file mode 100644
index 000000000..4f0b01e83
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntttobytes.S
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(ntttobytes_avx2):
+#consts
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0
+call ntttobytes128_avx
+add $256,%rsi
+add $192,%rdi
+call ntttobytes128_avx
+ret
+
+ntttobytes128_avx:
+#load
+vmovdqa (%rsi),%ymm5
+vmovdqa 32(%rsi),%ymm6
+vmovdqa 64(%rsi),%ymm7
+vmovdqa 96(%rsi),%ymm8
+vmovdqa 128(%rsi),%ymm9
+vmovdqa 160(%rsi),%ymm10
+vmovdqa 192(%rsi),%ymm11
+vmovdqa 224(%rsi),%ymm12
+
+#bitpack
+vpsllw $12,%ymm6,%ymm4
+vpor %ymm4,%ymm5,%ymm4
+
+vpsrlw $4,%ymm6,%ymm5
+vpsllw $8,%ymm7,%ymm6
+vpor %ymm5,%ymm6,%ymm5
+
+vpsrlw $8,%ymm7,%ymm6
+vpsllw $4,%ymm8,%ymm7
+vpor %ymm6,%ymm7,%ymm6
+
+vpsllw $12,%ymm10,%ymm7
+vpor %ymm7,%ymm9,%ymm7
+
+vpsrlw $4,%ymm10,%ymm8
+vpsllw $8,%ymm11,%ymm9
+vpor %ymm8,%ymm9,%ymm8
+
+vpsrlw $8,%ymm11,%ymm9
+vpsllw $4,%ymm12,%ymm10
+vpor %ymm9,%ymm10,%ymm9
+
+shuffle1 4,5,3,5
+shuffle1 6,7,4,7
+shuffle1 8,9,6,9
+
+shuffle2 3,4,8,4
+shuffle2 6,5,3,5
+shuffle2 7,9,6,9
+
+shuffle4 8,3,7,3
+shuffle4 6,4,8,4
+shuffle4 5,9,6,9
+
+shuffle8 7,8,5,8
+shuffle8 6,3,7,3
+shuffle8 4,9,6,9
+
+#store
+vmovdqu %ymm5,(%rdi)
+vmovdqu %ymm7,32(%rdi)
+vmovdqu %ymm6,64(%rdi)
+vmovdqu %ymm8,96(%rdi)
+vmovdqu %ymm3,128(%rdi)
+vmovdqu %ymm9,160(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttunpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttunpack.S
new file mode 100644
index 000000000..0cf45c671
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttunpack.S
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation from Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+#include "shuffle.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(nttunpack_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(nttunpack_avx2):
+call nttunpack128_avx2
+add $256,%rdi
+call nttunpack128_avx2
+ret
+
+nttunpack128_avx2:
+#load
+vmovdqa (%rdi),%ymm4
+vmovdqa 32(%rdi),%ymm5
+vmovdqa 64(%rdi),%ymm6
+vmovdqa 96(%rdi),%ymm7
+vmovdqa 128(%rdi),%ymm8
+vmovdqa 160(%rdi),%ymm9
+vmovdqa 192(%rdi),%ymm10
+vmovdqa 224(%rdi),%ymm11
+
+shuffle8 4,8,3,8
+shuffle8 5,9,4,9
+shuffle8 6,10,5,10
+shuffle8 7,11,6,11
+
+shuffle4 3,5,7,5
+shuffle4 8,10,3,10
+shuffle4 4,6,8,6
+shuffle4 9,11,4,11
+
+shuffle2 7,8,9,8
+shuffle2 5,6,7,6
+shuffle2 3,4,5,4
+shuffle2 10,11,3,11
+
+shuffle1 9,5,10,5
+shuffle1 8,4,9,4
+shuffle1 7,3,8,3
+shuffle1 6,11,7,11
+
+#store
+vmovdqa %ymm10,(%rdi)
+vmovdqa %ymm5,32(%rdi)
+vmovdqa %ymm9,64(%rdi)
+vmovdqa %ymm4,96(%rdi)
+vmovdqa %ymm8,128(%rdi)
+vmovdqa %ymm3,160(%rdi)
+vmovdqa %ymm7,192(%rdi)
+vmovdqa %ymm11,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/reduce.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/reduce.S
new file mode 100644
index 000000000..78bad0559
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/reduce.S
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Implementation based on Kyber reference repository
+// https://github.com/pq-crystals/kyber/blob/main/avx2
+
+// Changes:
+// - Add call to csubq in reduce128_avx2 to produce outputs
+// in [0,1,...,q-1] rather than [0,1,...,q], matching the
+// semantics of poly_reduce().
+
+#include "../../../common.h"
+
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
+/* simpasm: header-end */
+
+#include "consts.h"
+#include "fq.inc"
+
+.text
+.global MLKEM_ASM_NAMESPACE(reduce_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(reduce_avx2):
+#consts
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1
+call reduce128_avx2
+add $256,%rdi
+call reduce128_avx2
+ret
+
+reduce128_avx2:
+#load
+vmovdqa (%rdi),%ymm2
+vmovdqa 32(%rdi),%ymm3
+vmovdqa 64(%rdi),%ymm4
+vmovdqa 96(%rdi),%ymm5
+vmovdqa 128(%rdi),%ymm6
+vmovdqa 160(%rdi),%ymm7
+vmovdqa 192(%rdi),%ymm8
+vmovdqa 224(%rdi),%ymm9
+
+red16 2
+red16 3
+red16 4
+red16 5
+red16 6
+red16 7
+red16 8
+red16 9
+
+csubq 2
+csubq 3
+csubq 4
+csubq 5
+csubq 6
+csubq 7
+csubq 8
+csubq 9
+
+#store
+vmovdqa %ymm2,(%rdi)
+vmovdqa %ymm3,32(%rdi)
+vmovdqa %ymm4,64(%rdi)
+vmovdqa %ymm5,96(%rdi)
+vmovdqa %ymm6,128(%rdi)
+vmovdqa %ymm7,160(%rdi)
+vmovdqa %ymm8,192(%rdi)
+vmovdqa %ymm9,224(%rdi)
+
+ret
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S
deleted file mode 100644
index 9bcd04896..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-// Implementation from Kyber reference repository
-// https://github.com/pq-crystals/kyber/blob/main/avx2
-
-#include "../../../common.h"
-
-#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
-
-#include "consts.h"
-#include "fq.inc"
-#include "shuffle.inc"
-
-.global MLKEM_ASM_NAMESPACE(nttpack_avx2)
-MLKEM_ASM_NAMESPACE(nttpack_avx2):
-#load
-vmovdqa (%rdi),%ymm4
-vmovdqa 32(%rdi),%ymm5
-vmovdqa 64(%rdi),%ymm6
-vmovdqa 96(%rdi),%ymm7
-vmovdqa 128(%rdi),%ymm8
-vmovdqa 160(%rdi),%ymm9
-vmovdqa 192(%rdi),%ymm10
-vmovdqa 224(%rdi),%ymm11
-
-shuffle1 4,5,3,5
-shuffle1 6,7,4,7
-shuffle1 8,9,6,9
-shuffle1 10,11,8,11
-
-shuffle2 3,4,10,4
-shuffle2 6,8,3,8
-shuffle2 5,7,6,7
-shuffle2 9,11,5,11
-
-shuffle4 10,3,9,3
-shuffle4 6,5,10,5
-shuffle4 4,8,6,8
-shuffle4 7,11,4,11
-
-shuffle8 9,10,7,10
-shuffle8 6,4,9,4
-shuffle8 3,5,6,5
-shuffle8 8,11,3,11
-
-#store
-vmovdqa %ymm7,(%rdi)
-vmovdqa %ymm9,32(%rdi)
-vmovdqa %ymm6,64(%rdi)
-vmovdqa %ymm3,96(%rdi)
-vmovdqa %ymm10,128(%rdi)
-vmovdqa %ymm4,160(%rdi)
-vmovdqa %ymm5,192(%rdi)
-vmovdqa %ymm11,224(%rdi)
-
-ret
-
-nttunpack128_avx2:
-#load
-vmovdqa (%rdi),%ymm4
-vmovdqa 32(%rdi),%ymm5
-vmovdqa 64(%rdi),%ymm6
-vmovdqa 96(%rdi),%ymm7
-vmovdqa 128(%rdi),%ymm8
-vmovdqa 160(%rdi),%ymm9
-vmovdqa 192(%rdi),%ymm10
-vmovdqa 224(%rdi),%ymm11
-
-shuffle8 4,8,3,8
-shuffle8 5,9,4,9
-shuffle8 6,10,5,10
-shuffle8 7,11,6,11
-
-shuffle4 3,5,7,5
-shuffle4 8,10,3,10
-shuffle4 4,6,8,6
-shuffle4 9,11,4,11
-
-shuffle2 7,8,9,8
-shuffle2 5,6,7,6
-shuffle2 3,4,5,4
-shuffle2 10,11,3,11
-
-shuffle1 9,5,10,5
-shuffle1 8,4,9,4
-shuffle1 7,3,8,3
-shuffle1 6,11,7,11
-
-#store
-vmovdqa %ymm10,(%rdi)
-vmovdqa %ymm5,32(%rdi)
-vmovdqa %ymm9,64(%rdi)
-vmovdqa %ymm4,96(%rdi)
-vmovdqa %ymm8,128(%rdi)
-vmovdqa %ymm3,160(%rdi)
-vmovdqa %ymm7,192(%rdi)
-vmovdqa %ymm11,224(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(nttunpack_avx2)
-MLKEM_ASM_NAMESPACE(nttunpack_avx2):
-call nttunpack128_avx2
-add $256,%rdi
-call nttunpack128_avx2
-ret
-
-ntttobytes128_avx:
-#load
-vmovdqa (%rsi),%ymm5
-vmovdqa 32(%rsi),%ymm6
-vmovdqa 64(%rsi),%ymm7
-vmovdqa 96(%rsi),%ymm8
-vmovdqa 128(%rsi),%ymm9
-vmovdqa 160(%rsi),%ymm10
-vmovdqa 192(%rsi),%ymm11
-vmovdqa 224(%rsi),%ymm12
-
-#bitpack
-vpsllw $12,%ymm6,%ymm4
-vpor %ymm4,%ymm5,%ymm4
-
-vpsrlw $4,%ymm6,%ymm5
-vpsllw $8,%ymm7,%ymm6
-vpor %ymm5,%ymm6,%ymm5
-
-vpsrlw $8,%ymm7,%ymm6
-vpsllw $4,%ymm8,%ymm7
-vpor %ymm6,%ymm7,%ymm6
-
-vpsllw $12,%ymm10,%ymm7
-vpor %ymm7,%ymm9,%ymm7
-
-vpsrlw $4,%ymm10,%ymm8
-vpsllw $8,%ymm11,%ymm9
-vpor %ymm8,%ymm9,%ymm8
-
-vpsrlw $8,%ymm11,%ymm9
-vpsllw $4,%ymm12,%ymm10
-vpor %ymm9,%ymm10,%ymm9
-
-shuffle1 4,5,3,5
-shuffle1 6,7,4,7
-shuffle1 8,9,6,9
-
-shuffle2 3,4,8,4
-shuffle2 6,5,3,5
-shuffle2 7,9,6,9
-
-shuffle4 8,3,7,3
-shuffle4 6,4,8,4
-shuffle4 5,9,6,9
-
-shuffle8 7,8,5,8
-shuffle8 6,3,7,3
-shuffle8 4,9,6,9
-
-#store
-vmovdqu %ymm5,(%rdi)
-vmovdqu %ymm7,32(%rdi)
-vmovdqu %ymm6,64(%rdi)
-vmovdqu %ymm8,96(%rdi)
-vmovdqu %ymm3,128(%rdi)
-vmovdqu %ymm9,160(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2)
-MLKEM_ASM_NAMESPACE(ntttobytes_avx2):
-#consts
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0
-call ntttobytes128_avx
-add $256,%rsi
-add $192,%rdi
-call ntttobytes128_avx
-ret
-
-nttfrombytes128_avx:
-#load
-vmovdqu (%rsi),%ymm4
-vmovdqu 32(%rsi),%ymm5
-vmovdqu 64(%rsi),%ymm6
-vmovdqu 96(%rsi),%ymm7
-vmovdqu 128(%rsi),%ymm8
-vmovdqu 160(%rsi),%ymm9
-
-shuffle8 4,7,3,7
-shuffle8 5,8,4,8
-shuffle8 6,9,5,9
-
-shuffle4 3,8,6,8
-shuffle4 7,5,3,5
-shuffle4 4,9,7,9
-
-shuffle2 6,5,4,5
-shuffle2 8,7,6,7
-shuffle2 3,9,8,9
-
-shuffle1 4,7,10,7
-shuffle1 5,8,4,8
-shuffle1 6,9,5,9
-
-#bitunpack
-vpsrlw $12,%ymm10,%ymm11
-vpsllw $4,%ymm7,%ymm12
-vpor %ymm11,%ymm12,%ymm11
-vpand %ymm0,%ymm10,%ymm10
-vpand %ymm0,%ymm11,%ymm11
-
-vpsrlw $8,%ymm7,%ymm12
-vpsllw $8,%ymm4,%ymm13
-vpor %ymm12,%ymm13,%ymm12
-vpand %ymm0,%ymm12,%ymm12
-
-vpsrlw $4,%ymm4,%ymm13
-vpand %ymm0,%ymm13,%ymm13
-
-vpsrlw $12,%ymm8,%ymm14
-vpsllw $4,%ymm5,%ymm15
-vpor %ymm14,%ymm15,%ymm14
-vpand %ymm0,%ymm8,%ymm8
-vpand %ymm0,%ymm14,%ymm14
-
-vpsrlw $8,%ymm5,%ymm15
-vpsllw $8,%ymm9,%ymm1
-vpor %ymm15,%ymm1,%ymm15
-vpand %ymm0,%ymm15,%ymm15
-
-vpsrlw $4,%ymm9,%ymm1
-vpand %ymm0,%ymm1,%ymm1
-
-#store
-vmovdqa %ymm10,(%rdi)
-vmovdqa %ymm11,32(%rdi)
-vmovdqa %ymm12,64(%rdi)
-vmovdqa %ymm13,96(%rdi)
-vmovdqa %ymm8,128(%rdi)
-vmovdqa %ymm14,160(%rdi)
-vmovdqa %ymm15,192(%rdi)
-vmovdqa %ymm1,224(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2)
-MLKEM_ASM_NAMESPACE(nttfrombytes_avx2):
-#consts
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0
-call nttfrombytes128_avx
-add $256,%rdi
-add $192,%rsi
-call nttfrombytes128_avx
-ret
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/tomont.S
similarity index 64%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/tomont.S
index 3f013a5fa..7774cec0b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/tomont.S
@@ -14,63 +14,24 @@
#include "../../../common.h"
#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
-#include "consts.h"
+/* simpasm: header-end */
+#include "consts.h"
#include "fq.inc"
.text
-reduce128_avx2:
-#load
-vmovdqa (%rdi),%ymm2
-vmovdqa 32(%rdi),%ymm3
-vmovdqa 64(%rdi),%ymm4
-vmovdqa 96(%rdi),%ymm5
-vmovdqa 128(%rdi),%ymm6
-vmovdqa 160(%rdi),%ymm7
-vmovdqa 192(%rdi),%ymm8
-vmovdqa 224(%rdi),%ymm9
-
-red16 2
-red16 3
-red16 4
-red16 5
-red16 6
-red16 7
-red16 8
-red16 9
-
-csubq 2
-csubq 3
-csubq 4
-csubq 5
-csubq 6
-csubq 7
-csubq 8
-csubq 9
-
-#store
-vmovdqa %ymm2,(%rdi)
-vmovdqa %ymm3,32(%rdi)
-vmovdqa %ymm4,64(%rdi)
-vmovdqa %ymm5,96(%rdi)
-vmovdqa %ymm6,128(%rdi)
-vmovdqa %ymm7,160(%rdi)
-vmovdqa %ymm8,192(%rdi)
-vmovdqa %ymm9,224(%rdi)
-
-ret
-
-.global MLKEM_ASM_NAMESPACE(reduce_avx2)
-MLKEM_ASM_NAMESPACE(reduce_avx2):
+.global MLKEM_ASM_NAMESPACE(tomont_avx2)
+.balign 4
+MLKEM_ASM_NAMESPACE(tomont_avx2):
#consts
vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1
-call reduce128_avx2
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1
+vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2
+call tomont128_avx2
add $256,%rdi
-call reduce128_avx2
+call tomont128_avx2
ret
-
tomont128_avx2:
#load
vmovdqa (%rdi),%ymm3
@@ -103,15 +64,5 @@ vmovdqa %ymm10,224(%rdi)
ret
-.global MLKEM_ASM_NAMESPACE(tomont_avx2)
-MLKEM_ASM_NAMESPACE(tomont_avx2):
-#consts
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1
-vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2
-call tomont128_avx2
-add $256,%rdi
-call tomont128_avx2
-ret
-
+/* simpasm: footer-start */
#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */