From b7ab6ec2ee9f1f576f6f44b595a283c17b3acb56 Mon Sep 17 00:00:00 2001 From: mmc28a <78873583+mmc28a@users.noreply.github.com> Date: Wed, 24 Jul 2024 11:06:17 +0100 Subject: [PATCH] Support SM3 accelerating instructions (#108) Add support for seven Neon SM3 accelerating instructions. --- src/aarch64/assembler-aarch64.cc | 64 ++++ src/aarch64/assembler-aarch64.h | 36 +++ src/aarch64/cpu-features-auditor-aarch64.cc | 6 + src/aarch64/cpu-features-auditor-aarch64.h | 1 + src/aarch64/decoder-visitor-map-aarch64.h | 14 +- src/aarch64/disasm-aarch64.cc | 19 ++ src/aarch64/disasm-aarch64.h | 2 + src/aarch64/logic-aarch64.cc | 129 +++++++- src/aarch64/macro-assembler-aarch64.h | 16 +- src/aarch64/simulator-aarch64.cc | 33 ++ src/aarch64/simulator-aarch64.h | 26 +- test/aarch64/test-cpu-features-aarch64.cc | 13 + test/aarch64/test-disasm-aarch64.cc | 18 +- test/aarch64/test-disasm-neon-aarch64.cc | 21 ++ test/aarch64/test-simulator-sve-aarch64.cc | 336 ++++++++++++++++++++ 15 files changed, 710 insertions(+), 24 deletions(-) diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc index de2dbc99..534c1f76 100644 --- a/src/aarch64/assembler-aarch64.cc +++ b/src/aarch64/assembler-aarch64.cc @@ -6053,6 +6053,70 @@ void Assembler::aesmc(const VRegister& vd, const VRegister& vn) { Emit(0x4e286800 | Rd(vd) | Rn(vn)); } +void Assembler::sm3partw1(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + + Emit(0xce60c000 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sm3partw2(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + + Emit(0xce60c400 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sm3ss1(const VRegister& vd, const VRegister& vn, const VRegister& 
vm, const VRegister& va) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S() && va.Is4S()); + + Emit(0xce400000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va)); +} + +void Assembler::sm3tt1a(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + VIXL_ASSERT(IsUint2(index)); + + Instr i = static_cast<Instr>(index) << 12; + Emit(0xce408000 | Rd(vd) | Rn(vn) | Rm(vm) | i); +} + +void Assembler::sm3tt1b(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + VIXL_ASSERT(IsUint2(index)); + + Instr i = static_cast<Instr>(index) << 12; + Emit(0xce408400 | Rd(vd) | Rn(vn) | Rm(vm) | i); +} + +void Assembler::sm3tt2a(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + VIXL_ASSERT(IsUint2(index)); + + Instr i = static_cast<Instr>(index) << 12; + Emit(0xce408800 | Rd(vd) | Rn(vn) | Rm(vm) | i); +} + +void Assembler::sm3tt2b(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + VIXL_ASSERT(IsUint2(index)); + + Instr i = static_cast<Instr>(index) << 12; + Emit(0xce408c00 | Rd(vd) | Rn(vn) | Rm(vm) | i); +} + // Note: // For all ToImm instructions below, a difference in case // for the same letter indicates a negated bit. 
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h index cc4b3467..c1e4e6a7 100644 --- a/src/aarch64/assembler-aarch64.h +++ b/src/aarch64/assembler-aarch64.h @@ -3696,6 +3696,42 @@ class Assembler : public vixl::internal::AssemblerBase { // AES mix columns. void aesmc(const VRegister& vd, const VRegister& vn); + // SM3PARTW1. + void sm3partw1(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SM3PARTW2. + void sm3partw2(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SM3SS1. + void sm3ss1(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + const VRegister& va); + + // SM3TT1A. + void sm3tt1a(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int index); + + // SM3TT1B. + void sm3tt1b(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int index); + + // SM3TT2A. + void sm3tt2a(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int index); + + // SM3TT2B. + void sm3tt2b(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int index); + // Scalable Vector Extensions. // Absolute value (predicated). 
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc index d815924a..66d29f0e 100644 --- a/src/aarch64/cpu-features-auditor-aarch64.cc +++ b/src/aarch64/cpu-features-auditor-aarch64.cc @@ -279,6 +279,12 @@ void CPUFeaturesAuditor::VisitCryptoAES(const Instruction* instr) { USE(instr); } +void CPUFeaturesAuditor::VisitCryptoSM3(const Instruction* instr) { + RecordInstructionFeaturesScope scope(this); + scope.Record(CPUFeatures::kNEON, CPUFeatures::kSM3); + USE(instr); +} + void CPUFeaturesAuditor::VisitDataProcessing1Source(const Instruction* instr) { RecordInstructionFeaturesScope scope(this); switch (instr->Mask(DataProcessing1SourceMask)) { diff --git a/src/aarch64/cpu-features-auditor-aarch64.h b/src/aarch64/cpu-features-auditor-aarch64.h index 67de6443..24967562 100644 --- a/src/aarch64/cpu-features-auditor-aarch64.h +++ b/src/aarch64/cpu-features-auditor-aarch64.h @@ -113,6 +113,7 @@ class CPUFeaturesAuditor : public DecoderVisitor { #define DECLARE(A) virtual void Visit##A(const Instruction* instr); VISITOR_LIST(DECLARE) #undef DECLARE + void VisitCryptoSM3(const Instruction* instr); void LoadStoreHelper(const Instruction* instr); void LoadStorePairHelper(const Instruction* instr); diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h index db51cadb..b4b39f55 100644 --- a/src/aarch64/decoder-visitor-map-aarch64.h +++ b/src/aarch64/decoder-visitor-map-aarch64.h @@ -2656,13 +2656,13 @@ {"ldtrsw_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ldtr_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ldtr_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3partw1_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3partw2_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3ss1_vvv4_crypto4"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3tt1a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \ - 
{"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \ + {"sm3partw1_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3partw2_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3ss1_vvv4_crypto4"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3tt1a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \ {"sm4ekey_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ {"sm4e_vv4_cryptosha512_2"_h, &VISITORCLASS::VisitUnimplemented}, \ {"st64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented}, \ diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc index f03b2572..ebfc2c71 100644 --- a/src/aarch64/disasm-aarch64.cc +++ b/src/aarch64/disasm-aarch64.cc @@ -2204,6 +2204,25 @@ void Disassembler::VisitCryptoAES(const Instruction *instr) { FormatWithDecodedMnemonic(instr, "'Vd.16b, 'Vn.16b"); } +void Disassembler::VisitCryptoSM3(const Instruction *instr) { + const char *form = "'Vd.4s, 'Vn.4s, 'Vm."; + const char *suffix = "4s"; + + switch (form_hash_) { + case "sm3ss1_vvv4_crypto4"_h: + suffix = "4s, 'Va.4s"; + break; + case "sm3tt1a_vvv4_crypto3_imm2"_h: + case "sm3tt1b_vvv4_crypto3_imm2"_h: + case "sm3tt2a_vvv4_crypto3_imm2"_h: + case "sm3tt2b_vvv_crypto3_imm2"_h: + suffix = "s['u1312]"; + break; + } + + FormatWithDecodedMnemonic(instr, form, suffix); +} + void Disassembler::DisassembleSHA512(const Instruction *instr) { const char *form = "'Qd, 'Qn, 'Vm.2d"; const char *suffix = NULL; diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h index 574d4f4b..b139c4c2 100644 --- a/src/aarch64/disasm-aarch64.h +++ b/src/aarch64/disasm-aarch64.h @@ -243,6 
+243,8 @@ class Disassembler : public DecoderVisitor { void Disassemble_Xd_XnSP_Xm(const Instruction* instr); void Disassemble_Xd_XnSP_XmSP(const Instruction* instr); + void VisitCryptoSM3(const Instruction* instr); + void Format(const Instruction* instr, const char* mnemonic, const char* format0, diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc index 9a81e49c..4d50568e 100644 --- a/src/aarch64/logic-aarch64.cc +++ b/src/aarch64/logic-aarch64.cc @@ -7895,17 +7895,17 @@ LogicVRegister Simulator::fmatmul(VectorFormat vform, } template <> -uint64_t SHA1Operation<"choose"_h>(uint64_t x, uint64_t y, uint64_t z) { +uint64_t CryptoOp<"choose"_h>(uint64_t x, uint64_t y, uint64_t z) { return ((y ^ z) & x) ^ z; } template <> -uint64_t SHA1Operation<"majority"_h>(uint64_t x, uint64_t y, uint64_t z) { +uint64_t CryptoOp<"majority"_h>(uint64_t x, uint64_t y, uint64_t z) { return (x & y) | ((x | y) & z); } template <> -uint64_t SHA1Operation<"parity"_h>(uint64_t x, uint64_t y, uint64_t z) { +uint64_t CryptoOp<"parity"_h>(uint64_t x, uint64_t y, uint64_t z) { return x ^ y ^ z; } @@ -7932,8 +7932,8 @@ LogicVRegister Simulator::sha2h(LogicVRegister srcdst, } for (unsigned i = 0; i < ArrayLength(x); i++) { - uint64_t chs = SHA1Operation<"choose"_h>(y[0], y[1], y[2]); - uint64_t maj = SHA1Operation<"majority"_h>(x[0], x[1], x[2]); + uint64_t chs = CryptoOp<"choose"_h>(y[0], y[1], y[2]); + uint64_t maj = CryptoOp<"majority"_h>(x[0], x[1], x[2]); uint64_t w = src2.Uint(kFormat4S, i); uint64_t t = y[3] + SHASigma(y[0]) + chs + w; @@ -8351,6 +8351,125 @@ LogicVRegister Simulator::aes(LogicVRegister dst, return dst; } +LogicVRegister Simulator::sm3partw1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2) { + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + + SimVRegister temp; + + ext(kFormat16B, temp, src2, temp, 4); + rol(kFormat4S, temp, temp, 15); + eor(kFormat4S, temp, temp, src1); + 
LogicVRegister r = eor(kFormat4S, temp, temp, srcdst); + + uint64_t result[4] = {}; + r.UintArray(kFormat4S, result); + for (int i = 0; i < 4; i++) { + if (i == 3) { + // result[3] already contains srcdst[3] ^ src1[3] from the operations + // above. + result[i] ^= ROL(result[0], 15); + } + result[i] ^= ROL(result[i], 15) ^ ROL(result[i], 23); + } + srcdst.SetUintArray(kFormat4S, result); + return srcdst; +} + +LogicVRegister Simulator::sm3partw2(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2) { + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + + SimVRegister temp; + VectorFormat vf = kFormat4S; + + rol(vf, temp, src2, 7); + LogicVRegister r = eor(vf, temp, temp, src1); + eor(vf, srcdst, temp, srcdst); + + uint64_t tmp2 = ROL(r.Uint(vf, 0), 15); + tmp2 ^= ROL(tmp2, 15) ^ ROL(tmp2, 23); + srcdst.SetUint(vf, 3, srcdst.Uint(vf, 3) ^ tmp2); + return srcdst; +} + +LogicVRegister Simulator::sm3ss1(LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + const LogicVRegister& src3) { + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + + VectorFormat vf = kFormat4S; + uint64_t result = ROL(src1.Uint(vf, 3), 12); + result += src2.Uint(vf, 3) + src3.Uint(vf, 3); + dst.Clear(); + dst.SetUint(vf, 3, ROL(result, 7)); + return dst; +} + +LogicVRegister Simulator::sm3tt1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index, + bool is_a) { + VectorFormat vf = kFormat4S; + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + auto sd = std::bind(&LogicVRegister::Uint, srcdst, vf, _1); + + VIXL_ASSERT(IsUint2(index)); + + uint64_t wjprime = src2.Uint(vf, index); + uint64_t ss2 = src1.Uint(vf, 3) ^ ROL(sd(3), 12); + + uint64_t tt1; + if (is_a) { + tt1 = CryptoOp<"parity"_h>(sd(1), sd(2), sd(3)); + } else { + tt1 = CryptoOp<"majority"_h>(sd(1), sd(2), sd(3)); + } 
+ tt1 += sd(0) + ss2 + wjprime; + + ext(kFormat16B, srcdst, srcdst, srcdst, 4); + srcdst.SetUint(vf, 1, ROL(sd(1), 9)); + srcdst.SetUint(vf, 3, tt1); + return srcdst; +} + +LogicVRegister Simulator::sm3tt2(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index, + bool is_a) { + VectorFormat vf = kFormat4S; + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + auto sd = std::bind(&LogicVRegister::Uint, srcdst, vf, _1); + + VIXL_ASSERT(IsUint2(index)); + + uint64_t wj = src2.Uint(vf, index); + + uint64_t tt2; + if (is_a) { + tt2 = CryptoOp<"parity"_h>(sd(1), sd(2), sd(3)); + } else { + tt2 = CryptoOp<"choose"_h>(sd(3), sd(2), sd(1)); + } + tt2 += sd(0) + src1.Uint(vf, 3) + wj; + + ext(kFormat16B, srcdst, srcdst, srcdst, 4); + srcdst.SetUint(vf, 1, ROL(sd(1), 19)); + tt2 ^= ROL(tt2, 9) ^ ROL(tt2, 17); + srcdst.SetUint(vf, 3, tt2); + return srcdst; +} + } // namespace aarch64 } // namespace vixl diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h index a989dddb..b74be350 100644 --- a/src/aarch64/macro-assembler-aarch64.h +++ b/src/aarch64/macro-assembler-aarch64.h @@ -2812,6 +2812,8 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { V(sha512su1, Sha512su1) \ V(shadd, Shadd) \ V(shsub, Shsub) \ + V(sm3partw1, Sm3partw1) \ + V(sm3partw2, Sm3partw2) \ V(smax, Smax) \ V(smaxp, Smaxp) \ V(smin, Smin) \ @@ -3052,7 +3054,11 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { V(umlsl, Umlsl) \ V(umlsl2, Umlsl2) \ V(sudot, Sudot) \ - V(usdot, Usdot) + V(usdot, Usdot) \ + V(sm3tt1a, Sm3tt1a) \ + V(sm3tt1b, Sm3tt1b) \ + V(sm3tt2a, Sm3tt2a) \ + V(sm3tt2b, Sm3tt2b) #define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \ @@ -3523,6 +3529,14 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { SingleEmissionCheckScope guard(this); st4(vt, vt2, vt3, vt4, lane, dst); } + void Sm3ss1(const VRegister& vd, + 
const VRegister& vn, + const VRegister& vm, + const VRegister& va) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + sm3ss1(vd, vn, vm, va); + } void Smov(const Register& rd, const VRegister& vn, int vn_index) { VIXL_ASSERT(allow_macro_instructions_); SingleEmissionCheckScope guard(this); diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc index e63715cd..83d1649a 100644 --- a/src/aarch64/simulator-aarch64.cc +++ b/src/aarch64/simulator-aarch64.cc @@ -7261,6 +7261,39 @@ void Simulator::VisitCryptoAES(const Instruction* instr) { } } +void Simulator::VisitCryptoSM3(const Instruction* instr) { + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + SimVRegister& rm = ReadVRegister(instr->GetRm()); + SimVRegister& ra = ReadVRegister(instr->GetRa()); + int index = instr->ExtractBits(13, 12); + + bool is_a = false; + switch (form_hash_) { + case "sm3partw1_vvv4_cryptosha512_3"_h: + sm3partw1(rd, rn, rm); + break; + case "sm3partw2_vvv4_cryptosha512_3"_h: + sm3partw2(rd, rn, rm); + break; + case "sm3ss1_vvv4_crypto4"_h: + sm3ss1(rd, rn, rm, ra); + break; + case "sm3tt1a_vvv4_crypto3_imm2"_h: + is_a = true; + VIXL_FALLTHROUGH(); + case "sm3tt1b_vvv4_crypto3_imm2"_h: + sm3tt1(rd, rn, rm, index, is_a); + break; + case "sm3tt2a_vvv4_crypto3_imm2"_h: + is_a = true; + VIXL_FALLTHROUGH(); + case "sm3tt2b_vvv_crypto3_imm2"_h: + sm3tt2(rd, rn, rm, index, is_a); + break; + } +} + void Simulator::SimulateSHA512(const Instruction* instr) { SimVRegister& rd = ReadVRegister(instr->GetRd()); SimVRegister& rn = ReadVRegister(instr->GetRn()); diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h index c20ec3c1..6e36246d 100644 --- a/src/aarch64/simulator-aarch64.h +++ b/src/aarch64/simulator-aarch64.h @@ -1283,7 +1283,7 @@ class SimExclusiveGlobalMonitor { class Debugger; template <uint32_t mode> -uint64_t 
CryptoOp(uint64_t x, uint64_t y, uint64_t z); class Simulator : public DecoderVisitor { public: @@ -1532,6 +1532,7 @@ class Simulator : public DecoderVisitor { void SimulateUnsignedMinMax(const Instruction* instr); void SimulateSHA512(const Instruction* instr); + void VisitCryptoSM3(const Instruction* instr); // Integer register accessors. @@ -4518,7 +4519,7 @@ class Simulator : public DecoderVisitor { srcdst.UintArray(kFormat4S, sd); for (unsigned i = 0; i < ArrayLength(sd); i++) { - uint64_t t = SHA1Operation<mode>(sd[1], sd[2], sd[3]); + uint64_t t = CryptoOp<mode>(sd[1], sd[2], sd[3]); y += RotateLeft(sd[0], 5, kSRegSize) + t; y += src2.Uint(kFormat4S, i); @@ -4561,6 +4562,27 @@ class Simulator : public DecoderVisitor { const LogicVRegister& src1, bool inverse); + LogicVRegister sm3partw1(LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2); + LogicVRegister sm3partw2(LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2); + LogicVRegister sm3ss1(LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + const LogicVRegister& src3); + LogicVRegister sm3tt1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index, + bool is_a); + LogicVRegister sm3tt2(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index, + bool is_a); + #define NEON_3VREG_LOGIC_LIST(V) \ V(addhn) \ V(addhn2) \ diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc index a14c5241..5cc2a58e 100644 --- a/test/aarch64/test-cpu-features-aarch64.cc +++ b/test/aarch64/test-cpu-features-aarch64.cc @@ -3835,5 +3835,18 @@ TEST_FEAT(aesimc_0, aesimc(v0.V16B(), v29.V16B())) TEST_FEAT(aesmc_0, aesmc(v0.V16B(), v29.V16B())) #undef TEST_FEAT +#define TEST_FEAT(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSM3), \ + NEON_SM3_##NAME, \ + ASM) +TEST_FEAT(sm3partw1_0, sm3partw1(v12.V4S(), v13.V4S(), 
v14.V4S())) +TEST_FEAT(sm3partw2_0, sm3partw2(v12.V4S(), v13.V4S(), v14.V4S())) +TEST_FEAT(sm3ss1_0, sm3ss1(v13.V4S(), v15.V4S(), v17.V4S(), v21.V4S())) +TEST_FEAT(sm3tt1a_0, sm3tt1a(v30.V4S(), v29.V4S(), v9.V4S(), 1)) +TEST_FEAT(sm3tt1b_0, sm3tt1b(v30.V4S(), v29.V4S(), v9.V4S(), 3)) +TEST_FEAT(sm3tt2a_0, sm3tt2a(v30.V4S(), v29.V4S(), v9.V4S(), 2)) +TEST_FEAT(sm3tt2b_0, sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0)) +#undef TEST_FEAT + } // namespace aarch64 } // namespace vixl diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc index 25820f63..6a29ffe5 100644 --- a/test/aarch64/test-disasm-aarch64.cc +++ b/test/aarch64/test-disasm-aarch64.cc @@ -3802,15 +3802,15 @@ TEST(architecture_features) { COMPARE_PREFIX(dci(0xcec08000), "sha512su0"); // SHA512SU0_VV2_cryptosha512_2 // ARMv8.2 - SM3 - // COMPARE_PREFIX(dci(0xce400000), "sm3ss1"); // SM3SS1_VVV4_crypto4 - // COMPARE_PREFIX(dci(0xce408000), "sm3tt1a"); // SM3TT1A_VVV4_crypto3_imm2 - // COMPARE_PREFIX(dci(0xce408400), "sm3tt1b"); // SM3TT1B_VVV4_crypto3_imm2 - // COMPARE_PREFIX(dci(0xce408800), "sm3tt2a"); // SM3TT2A_VVV4_crypto3_imm2 - // COMPARE_PREFIX(dci(0xce408c00), "sm3tt2b"); // SM3TT2B_VVV_crypto3_imm2 - // COMPARE_PREFIX(dci(0xce60c000), "sm3partw1"); // - // SM3PARTW1_VVV4_cryptosha512_3 - // COMPARE_PREFIX(dci(0xce60c400), "sm3partw2"); // - // SM3PARTW2_VVV4_cryptosha512_3 + COMPARE_PREFIX(dci(0xce400000), "sm3ss1"); // SM3SS1_VVV4_crypto4 + COMPARE_PREFIX(dci(0xce408000), "sm3tt1a"); // SM3TT1A_VVV4_crypto3_imm2 + COMPARE_PREFIX(dci(0xce408400), "sm3tt1b"); // SM3TT1B_VVV4_crypto3_imm2 + COMPARE_PREFIX(dci(0xce408800), "sm3tt2a"); // SM3TT2A_VVV4_crypto3_imm2 + COMPARE_PREFIX(dci(0xce408c00), "sm3tt2b"); // SM3TT2B_VVV_crypto3_imm2 + COMPARE_PREFIX(dci(0xce60c000), + "sm3partw1"); // SM3PARTW1_VVV4_cryptosha512_3 + COMPARE_PREFIX(dci(0xce60c400), + "sm3partw2"); // SM3PARTW2_VVV4_cryptosha512_3 // ARMv8.2 - SM4 // COMPARE_PREFIX(dci(0xce60c800), "sm4ekey"); // diff 
--git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc index 5bdded9d..26ecf4cc 100644 --- a/test/aarch64/test-disasm-neon-aarch64.cc +++ b/test/aarch64/test-disasm-neon-aarch64.cc @@ -4579,6 +4579,27 @@ TEST(neon_aes) { CLEANUP(); } +TEST(neon_sm3) { + SETUP(); + + COMPARE_MACRO(Sm3partw1(v12.V4S(), v13.V4S(), v14.V4S()), + "sm3partw1 v12.4s, v13.4s, v14.4s"); + COMPARE_MACRO(Sm3partw2(v12.V4S(), v13.V4S(), v14.V4S()), + "sm3partw2 v12.4s, v13.4s, v14.4s"); + COMPARE_MACRO(Sm3ss1(v13.V4S(), v15.V4S(), v17.V4S(), v21.V4S()), + "sm3ss1 v13.4s, v15.4s, v17.4s, v21.4s"); + COMPARE_MACRO(Sm3tt1a(v30.V4S(), v29.V4S(), v9.V4S(), 1), + "sm3tt1a v30.4s, v29.4s, v9.s[1]"); + COMPARE_MACRO(Sm3tt1b(v30.V4S(), v29.V4S(), v9.V4S(), 3), + "sm3tt1b v30.4s, v29.4s, v9.s[3]"); + COMPARE_MACRO(Sm3tt2a(v30.V4S(), v29.V4S(), v9.V4S(), 2), + "sm3tt2a v30.4s, v29.4s, v9.s[2]"); + COMPARE_MACRO(Sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0), + "sm3tt2b v30.4s, v29.4s, v9.s[0]"); + + CLEANUP(); +} + TEST(neon_unallocated_regression_test) { SETUP(); diff --git a/test/aarch64/test-simulator-sve-aarch64.cc b/test/aarch64/test-simulator-sve-aarch64.cc index e62fea26..585c00c1 100644 --- a/test/aarch64/test-simulator-sve-aarch64.cc +++ b/test/aarch64/test-simulator-sve-aarch64.cc @@ -1534,5 +1534,341 @@ TEST_SVE(neon_aesmc) { } } +TEST_SVE(neon_sm3) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSM3); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 10 * kInstructionSize); + __ dci(0xce591017); // sm3ss1 v23.4s, v0.4s, v25.4s, v4.4s + // vl128 state = 0xad4bba0a + __ dci(0xce49121f); // sm3ss1 v31.4s, v16.4s, v9.4s, v4.4s + // vl128 state = 0x84adef21 + __ dci(0xce49121e); // sm3ss1 v30.4s, v16.4s, v9.4s, v4.4s + // vl128 state = 0xccfd7e5a + __ dci(0xce49301a); // sm3ss1 v26.4s, v0.4s, v9.4s, v12.4s + // vl128 state = 0x60833cc7 + __ 
dci(0xce49720a); // sm3ss1 v10.4s, v16.4s, v9.4s, v28.4s + // vl128 state = 0x03f03263 + __ dci(0xce58721a); // sm3ss1 v26.4s, v16.4s, v24.4s, v28.4s + // vl128 state = 0x31845f40 + __ dci(0xce58702a); // sm3ss1 v10.4s, v1.4s, v24.4s, v28.4s + // vl128 state = 0x54c64f70 + __ dci(0xce58753a); // sm3ss1 v26.4s, v9.4s, v24.4s, v29.4s + // vl128 state = 0x3d5cb04f + __ dci(0xce507518); // sm3ss1 v24.4s, v8.4s, v16.4s, v29.4s + // vl128 state = 0xe02de221 + __ dci(0xce406519); // sm3ss1 v25.4s, v8.4s, v0.4s, v25.4s + // vl128 state = 0x73d36ae8 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x73d36ae8, + 0xcbcda2db, + 0x6ee9ad3d, + 0xa6857a16, + 0xa238ec05, + 0x1bc82d1d, + 0xe4530773, + 0xfb0d092e, + 0xe62aff0a, + 0xf56a593f, + 0x3967d590, + 0xebcd14a0, + 0xa7bedcb8, + 0x867fa43c, + 0x1679eab5, + 0x0a836861, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sm3partw12) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSM3); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 30 * kInstructionSize); + __ dci(0xce70c201); // sm3partw1 v1.4s, v16.4s, v16.4s + // vl128 state = 0x6f2069a6 + __ dci(0xce72c303); // sm3partw1 v3.4s, v24.4s, v18.4s + // vl128 state = 0x986fa56c + __ dci(0xce76c381); // sm3partw1 v1.4s, v28.4s, v22.4s + // vl128 state = 0x5dbd953c + __ dci(0xce7ec3b1); // sm3partw1 v17.4s, v29.4s, v30.4s + // vl128 state = 0xc72ccca5 + __ dci(0xce7ac1b5); // sm3partw1 v21.4s, v13.4s, v26.4s + // vl128 state = 0x33cdfd6a + __ dci(0xce7ac1b7); // sm3partw1 v23.4s, v13.4s, v26.4s + // vl128 state = 0x4303e945 + __ dci(0xce7ac1bf); // sm3partw1 v31.4s, v13.4s, v26.4s + // vl128 state = 0x56acac84 + __ dci(0xce78c1fd); // sm3partw1 
v29.4s, v15.4s, v24.4s + // vl128 state = 0x5e2a2793 + __ dci(0xce78c5df); // sm3partw2 v31.4s, v14.4s, v24.4s + // vl128 state = 0xf7c457f3 + __ dci(0xce70c55d); // sm3partw2 v29.4s, v10.4s, v16.4s + // vl128 state = 0xfa3557ac + __ dci(0xce60c159); // sm3partw1 v25.4s, v10.4s, v0.4s + // vl128 state = 0xb3ae6830 + __ dci(0xce62c55b); // sm3partw2 v27.4s, v10.4s, v2.4s + // vl128 state = 0xa7747c70 + __ dci(0xce66c753); // sm3partw2 v19.4s, v26.4s, v6.4s + // vl128 state = 0xb55f5895 + __ dci(0xce67c551); // sm3partw2 v17.4s, v10.4s, v7.4s + // vl128 state = 0x519b1342 + __ dci(0xce65c750); // sm3partw2 v16.4s, v26.4s, v5.4s + // vl128 state = 0xc4e6e4b9 + __ dci(0xce61c718); // sm3partw2 v24.4s, v24.4s, v1.4s + // vl128 state = 0x127c483c + __ dci(0xce61c71c); // sm3partw2 v28.4s, v24.4s, v1.4s + // vl128 state = 0x92783ecc + __ dci(0xce6dc714); // sm3partw2 v20.4s, v24.4s, v13.4s + // vl128 state = 0xe11e87d3 + __ dci(0xce65c756); // sm3partw2 v22.4s, v26.4s, v5.4s + // vl128 state = 0x8b6878d0 + __ dci(0xce65c5d2); // sm3partw2 v18.4s, v14.4s, v5.4s + // vl128 state = 0xf2fb1e86 + __ dci(0xce64c550); // sm3partw2 v16.4s, v10.4s, v4.4s + // vl128 state = 0x73ad3b0f + __ dci(0xce66c578); // sm3partw2 v24.4s, v11.4s, v6.4s + // vl128 state = 0x7e03900d + __ dci(0xce76c55c); // sm3partw2 v28.4s, v10.4s, v22.4s + // vl128 state = 0x1d0b5df6 + __ dci(0xce76c54c); // sm3partw2 v12.4s, v10.4s, v22.4s + // vl128 state = 0x1a3d7a77 + __ dci(0xce7ec448); // sm3partw2 v8.4s, v2.4s, v30.4s + // vl128 state = 0x3ed2e4bd + __ dci(0xce6ec409); // sm3partw2 v9.4s, v0.4s, v14.4s + // vl128 state = 0x826dd348 + __ dci(0xce6ec52b); // sm3partw2 v11.4s, v9.4s, v14.4s + // vl128 state = 0x3ff5e482 + __ dci(0xce66c72f); // sm3partw2 v15.4s, v25.4s, v6.4s + // vl128 state = 0x6fd24cd4 + __ dci(0xce65c73f); // sm3partw2 v31.4s, v25.4s, v5.4s + // vl128 state = 0xd51ac474 + __ dci(0xce67c77b); // sm3partw2 v27.4s, v27.4s, v7.4s + // vl128 state = 0x720d7419 + } + + uint32_t state; + 
ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x720d7419, + 0x31445e06, + 0xd2aee240, + 0x45a27e4b, + 0xd6c46f08, + 0xcaed7f9e, + 0x734820c7, + 0x377e1f38, + 0x12e03585, + 0x1b9cbe63, + 0x1d58d49a, + 0xc160a9dc, + 0x22c2fe25, + 0x86b7af0f, + 0xfeae7bf5, + 0xf8dfcc40, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sm3tt1) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSM3); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 20 * kInstructionSize); + __ dci(0xce53a363); // sm3tt1a v3.4s, v27.4s, v19.s[2] + // vl128 state = 0xaaa8c715 + __ dci(0xce58a7a7); // sm3tt1b v7.4s, v29.4s, v24.s[2] + // vl128 state = 0xb99a301d + __ dci(0xce5eb2b7); // sm3tt1a v23.4s, v21.4s, v30.s[3] + // vl128 state = 0xe8dabe99 + __ dci(0xce43b6ce); // sm3tt1b v14.4s, v22.4s, v3.s[3] + // vl128 state = 0xaa498ae5 + __ dci(0xce448027); // sm3tt1a v7.4s, v1.4s, v4.s[0] + // vl128 state = 0x32093547 + __ dci(0xce4286d8); // sm3tt1b v24.4s, v22.4s, v2.s[0] + // vl128 state = 0xe03e3a81 + __ dci(0xce44a0f3); // sm3tt1a v19.4s, v7.4s, v4.s[2] + // vl128 state = 0xcb555b4a + __ dci(0xce418233); // sm3tt1a v19.4s, v17.4s, v1.s[0] + // vl128 state = 0x751e4f7d + __ dci(0xce58a49f); // sm3tt1b v31.4s, v4.4s, v24.s[2] + // vl128 state = 0xcaff7580 + __ dci(0xce548326); // sm3tt1a v6.4s, v25.4s, v20.s[0] + // vl128 state = 0xc4308a78 + __ dci(0xce548124); // sm3tt1a v4.4s, v9.4s, v20.s[0] + // vl128 state = 0x1f1bfdfb + __ dci(0xce5fb282); // sm3tt1a v2.4s, v20.4s, v31.s[3] + // vl128 state = 0xa632c0b2 + __ dci(0xce549573); // sm3tt1b v19.4s, v11.4s, v20.s[1] + // vl128 state = 0x7fb7c2d3 + __ dci(0xce4387ae); // sm3tt1b v14.4s, v29.4s, v3.s[0] + // vl128 state = 0xe8d4c534 + __ 
dci(0xce5094eb); // sm3tt1b v11.4s, v7.4s, v16.s[1] + // vl128 state = 0xf34a4fbc + __ dci(0xce51b59f); // sm3tt1b v31.4s, v12.4s, v17.s[3] + // vl128 state = 0x98e388e9 + __ dci(0xce50a7bf); // sm3tt1b v31.4s, v29.4s, v16.s[2] + // vl128 state = 0x7cd7a6ac + __ dci(0xce5ca52e); // sm3tt1b v14.4s, v9.4s, v28.s[2] + // vl128 state = 0xce9410c5 + __ dci(0xce5aa741); // sm3tt1b v1.4s, v26.4s, v26.s[2] + // vl128 state = 0xd83fbd58 + __ dci(0xce5e94da); // sm3tt1b v26.4s, v6.4s, v30.s[1] + // vl128 state = 0xc6055fe3 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0xc6055fe3, + 0xa2c33f98, + 0x1cc9a227, + 0xf29eb254, + 0xd1739d6e, + 0x1c4fff34, + 0x0c182795, + 0x96e46836, + 0x43d010c9, + 0xd7c4f94c, + 0x78c387f2, + 0x4319fef3, + 0x72407eef, + 0xa77d3869, + 0x3c81c49a, + 0x68cc20ef, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sm3tt2) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSM3); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 20 * kInstructionSize); + __ dci(0xce439d42); // sm3tt2b v2.4s, v10.4s, v3.s[1] + // vl128 state = 0x388642cc + __ dci(0xce42b89d); // sm3tt2a v29.4s, v4.4s, v2.s[3] + // vl128 state = 0x66f4e60a + __ dci(0xce4da95d); // sm3tt2a v29.4s, v10.4s, v13.s[2] + // vl128 state = 0x95d4651d + __ dci(0xce49b926); // sm3tt2a v6.4s, v9.4s, v9.s[3] + // vl128 state = 0x826919fe + __ dci(0xce5cae33); // sm3tt2b v19.4s, v17.4s, v28.s[2] + // vl128 state = 0xb5cfefb0 + __ dci(0xce478959); // sm3tt2a v25.4s, v10.4s, v7.s[0] + // vl128 state = 0xfe17b730 + __ dci(0xce549cc2); // sm3tt2b v2.4s, v6.4s, v20.s[1] + // vl128 state = 0x769a0d76 + __ dci(0xce4c9f90); // sm3tt2b v16.4s, v28.4s, v12.s[1] + // vl128 state = 
0x8f633b95 + __ dci(0xce508d49); // sm3tt2b v9.4s, v10.4s, v16.s[0] + // vl128 state = 0x5eab6daa + __ dci(0xce59ad79); // sm3tt2b v25.4s, v11.4s, v25.s[2] + // vl128 state = 0xfb197616 + __ dci(0xce458fd6); // sm3tt2b v22.4s, v30.4s, v5.s[0] + // vl128 state = 0x875ff29d + __ dci(0xce4ab92c); // sm3tt2a v12.4s, v9.4s, v10.s[3] + // vl128 state = 0xad159c01 + __ dci(0xce598a1c); // sm3tt2a v28.4s, v16.4s, v25.s[0] + // vl128 state = 0x3da313e4 + __ dci(0xce43989f); // sm3tt2a v31.4s, v4.4s, v3.s[1] + // vl128 state = 0xc0a54179 + __ dci(0xce459c8a); // sm3tt2b v10.4s, v4.4s, v5.s[1] + // vl128 state = 0x4739cdbf + __ dci(0xce539959); // sm3tt2a v25.4s, v10.4s, v19.s[1] + // vl128 state = 0xd85f84ab + __ dci(0xce429be1); // sm3tt2a v1.4s, v31.4s, v2.s[1] + // vl128 state = 0x85b5871c + __ dci(0xce5d9fe3); // sm3tt2b v3.4s, v31.4s, v29.s[1] + // vl128 state = 0x2be5bd95 + __ dci(0xce4ebe16); // sm3tt2b v22.4s, v16.4s, v14.s[3] + // vl128 state = 0x2f8146e9 + __ dci(0xce599a63); // sm3tt2a v3.4s, v19.4s, v25.s[1] + // vl128 state = 0xa6e513e2 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0xa6e513e2, + 0x6bf4ae47, + 0x74e074db, + 0xae1a57e0, + 0x0db67f09, + 0x85332e49, + 0xc40d6565, + 0x07ed81aa, + 0xfa0e10bb, + 0x9addadfa, + 0xa9cea561, + 0xa481e17b, + 0x7c2be34e, + 0xd4cf493f, + 0x8b30cc5e, + 0xe44416d3, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + } // namespace aarch64 } // namespace vixl