From 6ddc3bcc693f4205ed4c9d6234cdb8b653401d12 Mon Sep 17 00:00:00 2001
From: Martyn Capewell
Date: Wed, 18 Oct 2023 15:51:41 +0100
Subject: [PATCH] Support SHA-3 accelerating instructions

Add support for Neon BCAX, EOR3, RAX1 and XAR instructions, used to
accelerate SHA-3.
---
 src/aarch64/assembler-aarch64.cc            |  33 ++++
 src/aarch64/assembler-aarch64.h             |  21 +++
 src/aarch64/cpu-features-auditor-aarch64.cc |   8 +
 src/aarch64/decoder-visitor-map-aarch64.h   |   4 -
 src/aarch64/disasm-aarch64.cc               |  15 ++
 src/aarch64/disasm-aarch64.h                |   3 +
 src/aarch64/macro-assembler-aarch64.h       |  25 +++
 src/aarch64/simulator-aarch64.cc            |  32 ++++
 src/aarch64/simulator-aarch64.h             |   1 +
 test/aarch64/test-cpu-features-aarch64.cc   |   9 ++
 test/aarch64/test-disasm-aarch64.cc         |   8 +-
 test/aarch64/test-disasm-neon-aarch64.cc    |  14 ++
 test/aarch64/test-simulator-sve-aarch64.cc  | 164 ++++++++++++++++++++
 tools/code_coverage.log                     |   1 +
 14 files changed, 330 insertions(+), 8 deletions(-)

diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index ad2e7c96..009b08c3 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -5876,6 +5876,39 @@ void Assembler::ummla(const VRegister& vd, const VRegister& vn, const VRegister&
   Emit(0x6e80a400 | Rd(vd) | Rn(vn) | Rm(vm));
 }
 
+void Assembler::bcax(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is16B() && vn.Is16B() && vm.Is16B() && va.Is16B());
+
+  Emit(0xce200000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va));
+}
+
+void Assembler::eor3(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is16B() && vn.Is16B() && vm.Is16B() && va.Is16B());
+
+  Emit(0xce000000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va));
+}
+
+void Assembler::xar(const VRegister& vd, const VRegister& vn, const VRegister& vm, int rotate) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D());
+  VIXL_ASSERT(IsUint6(rotate));
+
+  Emit(0xce800000 | Rd(vd) | Rn(vn) | Rm(vm) | rotate << 10);
+}
+
+void Assembler::rax1(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D());
+
+  Emit(0xce608c00 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
 // Note:
 // For all ToImm instructions below, a difference in case
 // for the same letter indicates a negated bit.
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index 9bc7076d..bbba5db7 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -3621,6 +3621,27 @@ class Assembler : public vixl::internal::AssemblerBase {
   // Unsigned 8-bit integer matrix multiply-accumulate (vector).
   void ummla(const VRegister& vd, const VRegister& vn, const VRegister& vm);
 
+  // Bit Clear and exclusive-OR.
+  void bcax(const VRegister& vd,
+            const VRegister& vn,
+            const VRegister& vm,
+            const VRegister& va);
+
+  // Three-way Exclusive-OR.
+  void eor3(const VRegister& vd,
+            const VRegister& vn,
+            const VRegister& vm,
+            const VRegister& va);
+
+  // Exclusive-OR and Rotate.
+ void xar(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int rotate); + + // Rotate and Exclusive-OR + void rax1(const VRegister& vd, const VRegister& vn, const VRegister& vm); + // Scalable Vector Extensions. // Absolute value (predicated). diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc index 3925ced3..a85587b8 100644 --- a/src/aarch64/cpu-features-auditor-aarch64.cc +++ b/src/aarch64/cpu-features-auditor-aarch64.cc @@ -1835,6 +1835,14 @@ void CPUFeaturesAuditor::Visit(Metadata* metadata, const Instruction* instr) { {"umax_64u_minmax_imm"_h, CPUFeatures::kCSSC}, {"umin_32u_minmax_imm"_h, CPUFeatures::kCSSC}, {"umin_64u_minmax_imm"_h, CPUFeatures::kCSSC}, + {"bcax_vvv16_crypto4"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)}, + {"eor3_vvv16_crypto4"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)}, + {"rax1_vvv2_cryptosha512_3"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)}, + {"xar_vvv2_crypto3_imm6"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)}, }; if (features.count(form_hash_) > 0) { diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h index a0a6ef22..92131da7 100644 --- a/src/aarch64/decoder-visitor-map-aarch64.h +++ b/src/aarch64/decoder-visitor-map-aarch64.h @@ -2638,7 +2638,6 @@ &VISITORCLASS::VisitUnconditionalBranchToRegister}, \ {"ret_64r_branch_reg"_h, \ &VISITORCLASS::VisitUnconditionalBranchToRegister}, \ - {"bcax_vvv16_crypto4"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfcvtn_asimdmisc_4s"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfdot_asimdelem_e"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfdot_asimdsame2_d"_h, &VISITORCLASS::VisitUnimplemented}, \ @@ -2646,7 +2645,6 @@ {"bfmlal_asimdsame2_f"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfmmla_asimdsame2_e"_h, &VISITORCLASS::VisitUnimplemented}, \ {"dsb_bon_barriers"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"eor3_vvv16_crypto4"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ld64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ldgm_64bulk_ldsttags"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ldtrb_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ @@ -2658,7 +2656,6 @@ {"ldtrsw_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ldtr_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ldtr_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"rax1_vvv2_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ {"sha512h2_qqv_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ {"sha512h_qqv_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ {"sha512su0_vv2_cryptosha512_2"_h, &VISITORCLASS::VisitUnimplemented}, \ @@ -2686,7 +2683,6 @@ {"ttest_br_systemresult"_h, &VISITORCLASS::VisitUnimplemented}, \ {"wfet_only_systeminstrswithreg"_h, &VISITORCLASS::VisitUnimplemented}, \ {"wfit_only_systeminstrswithreg"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"xar_vvv2_crypto3_imm6"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfcvt_z_p_z_s2bf"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfcvtnt_z_p_z_s2bf"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfdot_z_zzz"_h, &VISITORCLASS::VisitUnimplemented}, \ diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc index 9f53e813..5a8241dc 100644 --- a/src/aarch64/disasm-aarch64.cc +++ b/src/aarch64/disasm-aarch64.cc @@ -753,6 +753,10 @@ const Disassembler::FormToVisitorFnMap *Disassembler::GetFormToVisitorFnMap() { {"umax_64u_minmax_imm"_h, 
&Disassembler::DisassembleMinMaxImm}, {"umin_32u_minmax_imm"_h, &Disassembler::DisassembleMinMaxImm}, {"umin_64u_minmax_imm"_h, &Disassembler::DisassembleMinMaxImm}, + {"bcax_vvv16_crypto4"_h, &Disassembler::DisassembleNEON4Same}, + {"eor3_vvv16_crypto4"_h, &Disassembler::DisassembleNEON4Same}, + {"xar_vvv2_crypto3_imm6"_h, &Disassembler::DisassembleNEONXar}, + {"rax1_vvv2_cryptosha512_3"_h, &Disassembler::DisassembleNEONRax1}, }; return &form_to_visitor; } // NOLINT(readability/fn_size) @@ -2430,6 +2434,17 @@ void Disassembler::VisitNEON3SameExtra(const Instruction *instr) { Format(instr, mnemonic, nfd.Substitute(form), suffix); } +void Disassembler::DisassembleNEON4Same(const Instruction *instr) { + FormatWithDecodedMnemonic(instr, "'Vd.16b, 'Vn.16b, 'Vm.16b, 'Va.16b"); +} + +void Disassembler::DisassembleNEONXar(const Instruction *instr) { + FormatWithDecodedMnemonic(instr, "'Vd.2d, 'Vn.2d, 'Vm.2d, #'u1510"); +} + +void Disassembler::DisassembleNEONRax1(const Instruction *instr) { + FormatWithDecodedMnemonic(instr, "'Vd.2d, 'Vn.2d, 'Vm.2d"); +} void Disassembler::VisitNEON3Different(const Instruction *instr) { const char *mnemonic = mnemonic_.c_str(); diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h index 7985383b..0da49e41 100644 --- a/src/aarch64/disasm-aarch64.h +++ b/src/aarch64/disasm-aarch64.h @@ -229,6 +229,9 @@ class Disassembler : public DecoderVisitor { void DisassembleNEONScalar2RegMiscOnlyD(const Instruction* instr); void DisassembleNEONFPScalar2RegMisc(const Instruction* instr); void DisassembleNEONPolynomialMul(const Instruction* instr); + void DisassembleNEON4Same(const Instruction* instr); + void DisassembleNEONXar(const Instruction* instr); + void DisassembleNEONRax1(const Instruction* instr); void DisassembleMTELoadTag(const Instruction* instr); void DisassembleMTEStoreTag(const Instruction* instr); diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h index f6fc4d74..8878ef73 100644 --- a/src/aarch64/macro-assembler-aarch64.h +++ b/src/aarch64/macro-assembler-aarch64.h @@ -2787,6 +2787,7 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { V(pmull2, Pmull2) \ V(raddhn, Raddhn) \ V(raddhn2, Raddhn2) \ + V(rax1, Rax1) \ V(rsubhn, Rsubhn) \ V(rsubhn2, Rsubhn2) \ V(saba, Saba) \ @@ -3152,6 +3153,14 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { SVE_3VREG_COMMUTATIVE_MACRO_LIST(DEFINE_MACRO_ASM_FUNC) #undef DEFINE_MACRO_ASM_FUNC + void Bcax(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + const VRegister& va) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + bcax(vd, vn, vm, va); + } void Bic(const VRegister& vd, const int imm8, const int left_shift = 0) { VIXL_ASSERT(allow_macro_instructions_); SingleEmissionCheckScope guard(this); @@ -3192,6 +3201,14 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { SingleEmissionCheckScope guard(this); dup(vd, rn); } + void Eor3(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + const VRegister& va) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + eor3(vd, vn, vm, va); + } void Ext(const VRegister& vd, const VRegister& vn, const VRegister& vm, @@ -3498,6 +3515,14 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { SingleEmissionCheckScope guard(this); umov(rd, vn, vn_index); } + void Xar(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int 
rotate) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + xar(vd, vn, vm, rotate); + } void Crc32b(const Register& rd, const Register& rn, const Register& rm) { VIXL_ASSERT(allow_macro_instructions_); SingleEmissionCheckScope guard(this); diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc index 81bab07d..fdc71068 100644 --- a/src/aarch64/simulator-aarch64.cc +++ b/src/aarch64/simulator-aarch64.cc @@ -507,6 +507,10 @@ const Simulator::FormToVisitorFnMap* Simulator::GetFormToVisitorFnMap() { {"umax_64u_minmax_imm"_h, &Simulator::SimulateUnsignedMinMax}, {"umin_32u_minmax_imm"_h, &Simulator::SimulateUnsignedMinMax}, {"umin_64u_minmax_imm"_h, &Simulator::SimulateUnsignedMinMax}, + {"bcax_vvv16_crypto4"_h, &Simulator::SimulateNEONSHA3}, + {"eor3_vvv16_crypto4"_h, &Simulator::SimulateNEONSHA3}, + {"rax1_vvv2_cryptosha512_3"_h, &Simulator::SimulateNEONSHA3}, + {"xar_vvv2_crypto3_imm6"_h, &Simulator::SimulateNEONSHA3}, }; return &form_to_visitor; } @@ -9926,6 +9930,34 @@ void Simulator::VisitNEONPerm(const Instruction* instr) { } } +void Simulator::SimulateNEONSHA3(const Instruction* instr) { + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + SimVRegister& rm = ReadVRegister(instr->GetRm()); + SimVRegister& ra = ReadVRegister(instr->GetRa()); + SimVRegister temp; + + switch (form_hash_) { + case "bcax_vvv16_crypto4"_h: + bic(kFormat16B, temp, rm, ra); + eor(kFormat16B, rd, rn, temp); + break; + case "eor3_vvv16_crypto4"_h: + eor(kFormat16B, temp, rm, ra); + eor(kFormat16B, rd, rn, temp); + break; + case "rax1_vvv2_cryptosha512_3"_h: + ror(kFormat2D, temp, rm, 63); // rol(1) => ror(63) + eor(kFormat2D, rd, rn, temp); + break; + case "xar_vvv2_crypto3_imm6"_h: + int rot = instr->ExtractBits(15, 10); + eor(kFormat2D, temp, rn, rm); + ror(kFormat2D, rd, temp, rot); + break; + } +} + void Simulator::VisitSVEAddressGeneration(const Instruction* instr) { SimVRegister& zd = ReadVRegister(instr->GetRd()); SimVRegister& zn = ReadVRegister(instr->GetRn()); diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h index 760fa6cb..73277e4c 100644 --- a/src/aarch64/simulator-aarch64.h +++ b/src/aarch64/simulator-aarch64.h @@ -1509,6 +1509,7 @@ class Simulator : public DecoderVisitor { void SimulateNEONFPMulByElementLong(const Instruction* instr); void SimulateNEONComplexMulByElement(const Instruction* instr); void SimulateNEONDotProdByElement(const Instruction* instr); + void SimulateNEONSHA3(const Instruction* instr); void SimulateMTEAddSubTag(const Instruction* instr); void SimulateMTETagMaskInsert(const Instruction* instr); void SimulateMTESubPointer(const Instruction* instr); diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc index 187bbd55..8430d7f2 100644 --- a/test/aarch64/test-cpu-features-aarch64.cc +++ b/test/aarch64/test-cpu-features-aarch64.cc @@ -3785,5 +3785,14 @@ TEST_FP_FCMA_NEON_NEONHALF(fcmla_3, fcmla(v0.V8H(), v1.V8H(), v2.V8H(), 0)) TEST_FEAT(pmull1q_0, pmull(v5.V1Q(), v6.V1D(), v7.V1D())) #undef TEST_FEAT +#define TEST_NEON_SHA3(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3), \ + NEON_SHA3_##NAME, \ + ASM) +TEST_NEON_SHA3(bcax_0, bcax(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B())) +TEST_NEON_SHA3(eor3_0, eor3(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B())) +TEST_NEON_SHA3(xar_0, xar(v0.V2D(), v1.V2D(), v2.V2D(), 42)) +TEST_NEON_SHA3(rax1_0, rax1(v0.V2D(), v1.V2D(), v2.V2D())) 
+ } // namespace aarch64 } // namespace vixl diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc index 7c8f2ccf..5d8579f7 100644 --- a/test/aarch64/test-disasm-aarch64.cc +++ b/test/aarch64/test-disasm-aarch64.cc @@ -3789,10 +3789,10 @@ TEST(architecture_features) { COMPARE_PREFIX(dci(0xd503221f), "esb"); // ESB_HI_hints // ARMv8.2 - SHA3 - // COMPARE_PREFIX(dci(0xce000000), "eor3"); // EOR3_VVV16_crypto4 - // COMPARE_PREFIX(dci(0xce200000), "bcax"); // BCAX_VVV16_crypto4 - // COMPARE_PREFIX(dci(0xce608c00), "rax1"); // RAX1_VVV2_cryptosha512_3 - // COMPARE_PREFIX(dci(0xce800000), "xar"); // XAR_VVV2_crypto3_imm6 + COMPARE_PREFIX(dci(0xce000000), "eor3"); // EOR3_VVV16_crypto4 + COMPARE_PREFIX(dci(0xce200000), "bcax"); // BCAX_VVV16_crypto4 + COMPARE_PREFIX(dci(0xce608c00), "rax1"); // RAX1_VVV2_cryptosha512_3 + COMPARE_PREFIX(dci(0xce800000), "xar"); // XAR_VVV2_crypto3_imm6 // ARMv8.2 - SHA512 // COMPARE_PREFIX(dci(0xce608000), "sha512h"); // SHA512H_QQV_cryptosha512_3 diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc index c2824c9e..774114da 100644 --- a/test/aarch64/test-disasm-neon-aarch64.cc +++ b/test/aarch64/test-disasm-neon-aarch64.cc @@ -4516,6 +4516,20 @@ TEST(neon_matmul) { CLEANUP(); } +TEST(neon_sha3) { + SETUP(); + + COMPARE_MACRO(Bcax(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()), + "bcax v0.16b, v1.16b, v2.16b, v3.16b"); + COMPARE_MACRO(Eor3(v10.V16B(), v11.V16B(), v12.V16B(), v13.V16B()), + "eor3 v10.16b, v11.16b, v12.16b, v13.16b"); + COMPARE_MACRO(Xar(v20.V2D(), v21.V2D(), v22.V2D(), 42), + "xar v20.2d, v21.2d, v22.2d, #42"); + COMPARE_MACRO(Rax1(v0.V2D(), v1.V2D(), v2.V2D()), "rax1 v0.2d, v1.2d, v2.2d"); + + CLEANUP(); +} + TEST(neon_unallocated_regression_test) { SETUP(); diff --git a/test/aarch64/test-simulator-sve-aarch64.cc b/test/aarch64/test-simulator-sve-aarch64.cc index 1ba77835..bdd5c81d 100644 --- a/test/aarch64/test-simulator-sve-aarch64.cc +++ b/test/aarch64/test-simulator-sve-aarch64.cc @@ -394,5 +394,169 @@ TEST_SVE(neon_pmull) { } } +TEST_SVE(neon_sha3) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSHA3); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 60 * kInstructionSize); + __ dci(0xce608c00); // rax1 v0.2d, v0.2d, v0.2d + // vl128 state = 0x960c2b9f + __ dci(0xce608e28); // rax1 v8.2d, v17.2d, v0.2d + // vl128 state = 0x89ea3f7b + __ dci(0xce618e6c); // rax1 v12.2d, v19.2d, v1.2d + // vl128 state = 0xa7801384 + __ dci(0xce718e48); // rax1 v8.2d, v18.2d, v17.2d + // vl128 state = 0x4477d70d + __ dci(0xce738e60); // rax1 v0.2d, v19.2d, v19.2d + // vl128 state = 0xdee66854 + __ dci(0xce6b8e61); // rax1 v1.2d, v19.2d, v11.2d + // vl128 state = 0x2e383dc2 + __ dci(0xce6e8e60); // rax1 v0.2d, v19.2d, v14.2d + // vl128 state = 0xa022bb6d + __ dci(0xce6e8e62); // rax1 v2.2d, v19.2d, v14.2d + // vl128 state = 0x923f5d32 + __ dci(0xce668e23); // rax1 v3.2d, v17.2d, v6.2d + // vl128 state = 0xc2c6ca00 + __ dci(0xce260e33); // bcax v19.16b, v17.16b, v6.16b, v3.16b + // vl128 state = 0x517e85e9 + __ dci(0xce260e23); // bcax v3.16b, v17.16b, v6.16b, v3.16b + // vl128 state = 0xbcf4c332 + __ dci(0xce260e93); // bcax v19.16b, v20.16b, v6.16b, v3.16b + // vl128 state = 0x5d9d51ef + __ dci(0xce260a11); // bcax v17.16b, v16.16b, v6.16b, v2.16b + // vl128 state = 0x69ce0099 + __ dci(0xce260a15); // bcax v21.16b, v16.16b, v6.16b, v2.16b + // vl128 state = 0x9a2cdc9f + 
__ dci(0xce244a11); // bcax v17.16b, v16.16b, v4.16b, v18.16b + // vl128 state = 0x27eeff29 + __ dci(0xce304a10); // bcax v16.16b, v16.16b, v16.16b, v18.16b + // vl128 state = 0x6d586875 + __ dci(0xce314b18); // bcax v24.16b, v24.16b, v17.16b, v18.16b + // vl128 state = 0xe38b6054 + __ dci(0xce214b28); // bcax v8.16b, v25.16b, v1.16b, v18.16b + // vl128 state = 0x27a3f5f6 + __ dci(0xce294f38); // bcax v24.16b, v25.16b, v9.16b, v19.16b + // vl128 state = 0x7d7ffa9b + __ dci(0xce214e39); // bcax v25.16b, v17.16b, v1.16b, v19.16b + // vl128 state = 0x936374f0 + __ dci(0xce216a3d); // bcax v29.16b, v17.16b, v1.16b, v26.16b + // vl128 state = 0x1c5136d5 + __ dci(0xce296b39); // bcax v25.16b, v25.16b, v9.16b, v26.16b + // vl128 state = 0x75cd7131 + __ dci(0xce216338); // bcax v24.16b, v25.16b, v1.16b, v24.16b + // vl128 state = 0xcc747626 + __ dci(0xce2163f9); // bcax v25.16b, v31.16b, v1.16b, v24.16b + // vl128 state = 0x9409c8bc + __ dci(0xce2043f1); // bcax v17.16b, v31.16b, v0.16b, v16.16b + // vl128 state = 0x8db3a0c8 + __ dci(0xce2043f5); // bcax v21.16b, v31.16b, v0.16b, v16.16b + // vl128 state = 0xa55f8d7d + __ dci(0xce2043e5); // bcax v5.16b, v31.16b, v0.16b, v16.16b + // vl128 state = 0xe1960c7a + __ dci(0xce224be7); // bcax v7.16b, v31.16b, v2.16b, v18.16b + // vl128 state = 0xc9599bde + __ dci(0xce204bb7); // bcax v23.16b, v29.16b, v0.16b, v18.16b + // vl128 state = 0x7176d08d + __ dci(0xce004b9f); // eor3 v31.16b, v28.16b, v0.16b, v18.16b + // vl128 state = 0x10620821 + __ dci(0xce000baf); // eor3 v15.16b, v29.16b, v0.16b, v2.16b + // vl128 state = 0x0aba0288 + __ dci(0xce0a0bab); // eor3 v11.16b, v29.16b, v10.16b, v2.16b + // vl128 state = 0xe6517156 + __ dci(0xce0e1baf); // eor3 v15.16b, v29.16b, v14.16b, v6.16b + // vl128 state = 0x6b7021fb + __ dci(0xce0e3fa7); // eor3 v7.16b, v29.16b, v14.16b, v15.16b + // vl128 state = 0x05761b1f + __ dci(0xce0e2fe5); // eor3 v5.16b, v31.16b, v14.16b, v11.16b + // vl128 state = 0xe01822c6 + __ dci(0xce2e2fc7); // bcax v7.16b, v30.16b, v14.16b, v11.16b + // vl128 state = 0xdc6444d7 + __ dci(0xce3e2dcf); // bcax v15.16b, v14.16b, v30.16b, v11.16b + // vl128 state = 0xa5ecad2e + __ dci(0xce3e3fdf); // bcax v31.16b, v30.16b, v30.16b, v15.16b + // vl128 state = 0x2124dc42 + __ dci(0xce3a3ede); // bcax v30.16b, v22.16b, v26.16b, v15.16b + // vl128 state = 0x57f77204 + __ dci(0xce3a2e9c); // bcax v28.16b, v20.16b, v26.16b, v11.16b + // vl128 state = 0x6e8d303d + __ dci(0xce3a2294); // bcax v20.16b, v20.16b, v26.16b, v8.16b + // vl128 state = 0xdb53d42c + __ dci(0xce38029c); // bcax v28.16b, v20.16b, v24.16b, v0.16b + // vl128 state = 0x258d49b8 + __ dci(0xce38088c); // bcax v12.16b, v4.16b, v24.16b, v2.16b + // vl128 state = 0xe751a348 + __ dci(0xce28008e); // bcax v14.16b, v4.16b, v8.16b, v0.16b + // vl128 state = 0x8ce0aa1a + __ dci(0xce28008a); // bcax v10.16b, v4.16b, v8.16b, v0.16b + // vl128 state = 0x1fdf89a5 + __ dci(0xce280088); // bcax v8.16b, v4.16b, v8.16b, v0.16b + // vl128 state = 0xcc51f5e1 + __ dci(0xce2a1089); // bcax v9.16b, v4.16b, v10.16b, v4.16b + // vl128 state = 0xdaf766b0 + __ dci(0xce0b1081); // eor3 v1.16b, v4.16b, v11.16b, v4.16b + // vl128 state = 0x2da7deb5 + __ dci(0xce0a1011); // eor3 v17.16b, v0.16b, v10.16b, v4.16b + // vl128 state = 0xcc86f5d4 + __ dci(0xce121010); // eor3 v16.16b, v0.16b, v18.16b, v4.16b + // vl128 state = 0xfb722105 + __ dci(0xce921118); // xar v24.2d, v8.2d, v18.2d, #4 + // vl128 state = 0x9a7752e3 + __ dci(0xce9a1199); // xar v25.2d, v12.2d, v26.2d, #4 + // vl128 state = 0x83a251c2 + __ 
dci(0xce9e11dd);  // xar v29.2d, v14.2d, v30.2d, #4
+    // vl128 state = 0x1e31c9d5
+    __ dci(0xce9e915c);  // xar v28.2d, v10.2d, v30.2d, #36
+    // vl128 state = 0x0e421d73
+    __ dci(0xce1e115d);  // eor3 v29.16b, v10.16b, v30.16b, v4.16b
+    // vl128 state = 0xb5a8c677
+    __ dci(0xce3e515c);  // bcax v28.16b, v10.16b, v30.16b, v20.16b
+    // vl128 state = 0x21587300
+    __ dci(0xce3e5154);  // bcax v20.16b, v10.16b, v30.16b, v20.16b
+    // vl128 state = 0x9459c629
+    __ dci(0xce3e1056);  // bcax v22.16b, v2.16b, v30.16b, v4.16b
+    // vl128 state = 0xdb02263a
+    __ dci(0xce2a105e);  // bcax v30.16b, v2.16b, v10.16b, v4.16b
+    // vl128 state = 0xc9d210aa
+    __ dci(0xce3a5056);  // bcax v22.16b, v2.16b, v26.16b, v20.16b
+    // vl128 state = 0x4cc56293
+  }
+
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {
+        0x4cc56293,
+        0xee8bac03,
+        0xc1253ac9,
+        0x9fe5aa0f,
+        0x43df27f4,
+        0x19f03be6,
+        0xd26c928b,
+        0x7b9da4c4,
+        0xe13149a7,
+        0x9fa11ed9,
+        0xe02cc4dd,
+        0x7848dfe7,
+        0x5ed1726f,
+        0x983e0123,
+        0x34166240,
+        0xc4ee172f,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
 }  // namespace aarch64
 }  // namespace vixl
diff --git a/tools/code_coverage.log b/tools/code_coverage.log
index f913151b..d29b39b3 100644
--- a/tools/code_coverage.log
+++ b/tools/code_coverage.log
@@ -23,6 +23,7 @@
 1693487542 82.91% 97.57% 94.87%
 1694008240 82.72% 97.50% 94.95%
 1697036303 82.87% 97.56% 94.76%
+1698330215 82.92% 97.57% 94.88%
 1702052331 82.89% 97.59% 94.77%
 1706691191 82.87% 97.59% 94.74%
 1707395574 82.89% 97.59% 94.77%
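
Usage note (illustrative, not part of the patch): the four instructions map directly onto steps of the Keccak-f[1600] permutation underlying SHA-3, which is why they pay off as a group. The sketch below shows how the new MacroAssembler entry points (Bcax, Eor3, Rax1, Xar) might be driven for fragments of the theta, rho and chi steps. The function name, register assignments and the choice of rho offset are hypothetical; it assumes two independent Keccak states interleaved across the two 64-bit lanes of each Q register, and that the caller has already enabled CPUFeatures::kNEON and CPUFeatures::kSHA3 on the MacroAssembler.

#include "aarch64/macro-assembler-aarch64.h"

using namespace vixl::aarch64;

// Illustrative sketch only; the lane-to-register mapping is an assumption.
void GenerateKeccakFragment(MacroAssembler* masm) {
  // Theta, column parity: C0 = A[0,0] ^ A[0,1] ^ A[0,2] ^ A[0,3] ^ A[0,4],
  // with the five lanes assumed to be in v0..v4. EOR3 (vd = vn ^ vm ^ va)
  // reduces the five-way XOR to two instructions.
  masm->Eor3(v25.V16B(), v0.V16B(), v1.V16B(), v2.V16B());
  masm->Eor3(v25.V16B(), v25.V16B(), v3.V16B(), v4.V16B());

  // Theta: D1 = C0 ^ rol(C2, 1), with C2 assumed to be in v27.
  // RAX1 computes vd = vn ^ rol(vm, 1) in one instruction.
  masm->Rax1(v26.V2D(), v25.V2D(), v27.V2D());

  // Theta + rho for one lane: B = rol(A ^ D1, 44) = ror(A ^ D1, 64 - 44),
  // with the lane assumed to be in v6. XAR computes vd = ror(vn ^ vm, #imm6).
  masm->Xar(v28.V2D(), v6.V2D(), v26.V2D(), 64 - 44);

  // Chi for one lane: A' = B0 ^ (B2 & ~B1), with B0..B2 assumed to be in
  // v28..v30. BCAX computes vd = vn ^ (vm & ~va).
  masm->Bcax(v10.V16B(), v28.V16B(), v30.V16B(), v29.V16B());
}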