Skip to content

Commit

Permalink
Support SHA-3 accelerating instructions (#101)
Browse files Browse the repository at this point in the history
Add support for Neon BCAX, EOR3, RAX1 and XAR instructions, used to accelerate
SHA-3.
  • Loading branch information
mmc28a authored Jun 20, 2024
1 parent 3c40723 commit f307b4a
Show file tree
Hide file tree
Showing 14 changed files with 330 additions and 8 deletions.
33 changes: 33 additions & 0 deletions src/aarch64/assembler-aarch64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5876,6 +5876,39 @@ void Assembler::ummla(const VRegister& vd, const VRegister& vn, const VRegister&
Emit(0x6e80a400 | Rd(vd) | Rn(vn) | Rm(vm));
}

void Assembler::bcax(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) {
VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
VIXL_ASSERT(vd.Is16B() && vn.Is16B() && vm.Is16B());

Emit(0xce200000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va));
}

// EOR3: three-way exclusive-OR (four-register form).
//
// Requires the NEON and SHA3 CPU features; every operand must be a 16B vector.
void Assembler::eor3(const VRegister& vd,
                     const VRegister& vn,
                     const VRegister& vm,
                     const VRegister& va) {
  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
  VIXL_ASSERT(vd.Is16B());
  VIXL_ASSERT(vn.Is16B());
  VIXL_ASSERT(vm.Is16B());
  VIXL_ASSERT(va.Is16B());

  // Base opcode with the four register fields merged in.
  const auto encoding = 0xce000000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va);
  Emit(encoding);
}

// XAR: exclusive-OR and rotate.
//
// Requires the NEON and SHA3 CPU features; all vector operands must be 2D.
// `rotate` is a 6-bit unsigned immediate placed at bit 10 of the encoding.
void Assembler::xar(const VRegister& vd,
                    const VRegister& vn,
                    const VRegister& vm,
                    int rotate) {
  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
  VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D());
  VIXL_ASSERT(IsUint6(rotate));

  // Position the immediate first, then merge it with the register fields.
  const auto rotate_field = rotate << 10;
  Emit(0xce800000 | Rd(vd) | Rn(vn) | Rm(vm) | rotate_field);
}

// RAX1: rotate and exclusive-OR.
//
// Requires the NEON and SHA3 CPU features; all operands must be 2D vectors.
void Assembler::rax1(const VRegister& vd,
                     const VRegister& vn,
                     const VRegister& vm) {
  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
  VIXL_ASSERT(vd.Is2D());
  VIXL_ASSERT(vn.Is2D());
  VIXL_ASSERT(vm.Is2D());

  // Base opcode with the three register fields merged in.
  const auto encoding = 0xce608c00 | Rd(vd) | Rn(vn) | Rm(vm);
  Emit(encoding);
}

// Note:
// For all ToImm instructions below, a difference in case
// for the same letter indicates a negated bit.
Expand Down
21 changes: 21 additions & 0 deletions src/aarch64/assembler-aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -3621,6 +3621,27 @@ class Assembler : public vixl::internal::AssemblerBase {
// Unsigned 8-bit integer matrix multiply-accumulate (vector).
void ummla(const VRegister& vd, const VRegister& vn, const VRegister& vm);

// Bit Clear and exclusive-OR.
// Architecturally: vd = vn ^ (vm & ~va). Operands use the 16B arrangement;
// the implementation asserts the NEON and SHA3 CPU features.
void bcax(const VRegister& vd,
const VRegister& vn,
const VRegister& vm,
const VRegister& va);

// Three-way Exclusive-OR.
// Computes vd = vn ^ vm ^ va. All four operands must be 16B vectors; the
// implementation asserts the NEON and SHA3 CPU features.
void eor3(const VRegister& vd,
const VRegister& vn,
const VRegister& vm,
const VRegister& va);

// Exclusive-OR and Rotate.
// Computes vd = ror(vn ^ vm, rotate) on each 64-bit lane. Operands must be 2D
// vectors and `rotate` must fit in six unsigned bits (0-63); the
// implementation asserts the NEON and SHA3 CPU features.
void xar(const VRegister& vd,
const VRegister& vn,
const VRegister& vm,
int rotate);

// Rotate and Exclusive-OR.
// Computes vd = vn ^ rol(vm, 1) on each 64-bit lane. Operands must be 2D
// vectors; the implementation asserts the NEON and SHA3 CPU features.
void rax1(const VRegister& vd, const VRegister& vn, const VRegister& vm);

// Scalable Vector Extensions.

// Absolute value (predicated).
Expand Down
8 changes: 8 additions & 0 deletions src/aarch64/cpu-features-auditor-aarch64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1835,6 +1835,14 @@ void CPUFeaturesAuditor::Visit(Metadata* metadata, const Instruction* instr) {
{"umax_64u_minmax_imm"_h, CPUFeatures::kCSSC},
{"umin_32u_minmax_imm"_h, CPUFeatures::kCSSC},
{"umin_64u_minmax_imm"_h, CPUFeatures::kCSSC},
{"bcax_vvv16_crypto4"_h,
CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
{"eor3_vvv16_crypto4"_h,
CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
{"rax1_vvv2_cryptosha512_3"_h,
CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
{"xar_vvv2_crypto3_imm6"_h,
CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
};

if (features.count(form_hash_) > 0) {
Expand Down
4 changes: 0 additions & 4 deletions src/aarch64/decoder-visitor-map-aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -2638,15 +2638,13 @@
&VISITORCLASS::VisitUnconditionalBranchToRegister}, \
{"ret_64r_branch_reg"_h, \
&VISITORCLASS::VisitUnconditionalBranchToRegister}, \
{"bcax_vvv16_crypto4"_h, &VISITORCLASS::VisitUnimplemented}, \
{"bfcvtn_asimdmisc_4s"_h, &VISITORCLASS::VisitUnimplemented}, \
{"bfdot_asimdelem_e"_h, &VISITORCLASS::VisitUnimplemented}, \
{"bfdot_asimdsame2_d"_h, &VISITORCLASS::VisitUnimplemented}, \
{"bfmlal_asimdelem_f"_h, &VISITORCLASS::VisitUnimplemented}, \
{"bfmlal_asimdsame2_f"_h, &VISITORCLASS::VisitUnimplemented}, \
{"bfmmla_asimdsame2_e"_h, &VISITORCLASS::VisitUnimplemented}, \
{"dsb_bon_barriers"_h, &VISITORCLASS::VisitUnimplemented}, \
{"eor3_vvv16_crypto4"_h, &VISITORCLASS::VisitUnimplemented}, \
{"ld64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented}, \
{"ldgm_64bulk_ldsttags"_h, &VISITORCLASS::VisitUnimplemented}, \
{"ldtrb_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \
Expand All @@ -2658,7 +2656,6 @@
{"ldtrsw_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \
{"ldtr_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \
{"ldtr_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \
{"rax1_vvv2_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \
{"sha512h2_qqv_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \
{"sha512h_qqv_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \
{"sha512su0_vv2_cryptosha512_2"_h, &VISITORCLASS::VisitUnimplemented}, \
Expand Down Expand Up @@ -2686,7 +2683,6 @@
{"ttest_br_systemresult"_h, &VISITORCLASS::VisitUnimplemented}, \
{"wfet_only_systeminstrswithreg"_h, &VISITORCLASS::VisitUnimplemented}, \
{"wfit_only_systeminstrswithreg"_h, &VISITORCLASS::VisitUnimplemented}, \
{"xar_vvv2_crypto3_imm6"_h, &VISITORCLASS::VisitUnimplemented}, \
{"bfcvt_z_p_z_s2bf"_h, &VISITORCLASS::VisitUnimplemented}, \
{"bfcvtnt_z_p_z_s2bf"_h, &VISITORCLASS::VisitUnimplemented}, \
{"bfdot_z_zzz"_h, &VISITORCLASS::VisitUnimplemented}, \
Expand Down
15 changes: 15 additions & 0 deletions src/aarch64/disasm-aarch64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,10 @@ const Disassembler::FormToVisitorFnMap *Disassembler::GetFormToVisitorFnMap() {
{"umax_64u_minmax_imm"_h, &Disassembler::DisassembleMinMaxImm},
{"umin_32u_minmax_imm"_h, &Disassembler::DisassembleMinMaxImm},
{"umin_64u_minmax_imm"_h, &Disassembler::DisassembleMinMaxImm},
{"bcax_vvv16_crypto4"_h, &Disassembler::DisassembleNEON4Same},
{"eor3_vvv16_crypto4"_h, &Disassembler::DisassembleNEON4Same},
{"xar_vvv2_crypto3_imm6"_h, &Disassembler::DisassembleNEONXar},
{"rax1_vvv2_cryptosha512_3"_h, &Disassembler::DisassembleNEONRax1},
};
return &form_to_visitor;
} // NOLINT(readability/fn_size)
Expand Down Expand Up @@ -2430,6 +2434,17 @@ void Disassembler::VisitNEON3SameExtra(const Instruction *instr) {
Format(instr, mnemonic, nfd.Substitute(form), suffix);
}

// Disassemble four-register NEON forms whose operands are all 16B vectors
// (registered for bcax and eor3).
void Disassembler::DisassembleNEON4Same(const Instruction *instr) {
  const char *form = "'Vd.16b, 'Vn.16b, 'Vm.16b, 'Va.16b";
  FormatWithDecodedMnemonic(instr, form);
}

// Disassemble xar: three 2D vector operands followed by the rotate amount,
// printed as the unsigned value of instruction bits 15:10.
void Disassembler::DisassembleNEONXar(const Instruction *instr) {
  const char *form = "'Vd.2d, 'Vn.2d, 'Vm.2d, #'u1510";
  FormatWithDecodedMnemonic(instr, form);
}

// Disassemble rax1: three 2D vector operands and no immediate.
void Disassembler::DisassembleNEONRax1(const Instruction *instr) {
  const char *form = "'Vd.2d, 'Vn.2d, 'Vm.2d";
  FormatWithDecodedMnemonic(instr, form);
}

void Disassembler::VisitNEON3Different(const Instruction *instr) {
const char *mnemonic = mnemonic_.c_str();
Expand Down
3 changes: 3 additions & 0 deletions src/aarch64/disasm-aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,9 @@ class Disassembler : public DecoderVisitor {
void DisassembleNEONScalar2RegMiscOnlyD(const Instruction* instr);
void DisassembleNEONFPScalar2RegMisc(const Instruction* instr);
void DisassembleNEONPolynomialMul(const Instruction* instr);
// SHA3 helpers: four 16B registers (bcax, eor3), xar with its rotate
// immediate, and rax1.
void DisassembleNEON4Same(const Instruction* instr);
void DisassembleNEONXar(const Instruction* instr);
void DisassembleNEONRax1(const Instruction* instr);

void DisassembleMTELoadTag(const Instruction* instr);
void DisassembleMTEStoreTag(const Instruction* instr);
Expand Down
25 changes: 25 additions & 0 deletions src/aarch64/macro-assembler-aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -2787,6 +2787,7 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface {
V(pmull2, Pmull2) \
V(raddhn, Raddhn) \
V(raddhn2, Raddhn2) \
V(rax1, Rax1) \
V(rsubhn, Rsubhn) \
V(rsubhn2, Rsubhn2) \
V(saba, Saba) \
Expand Down Expand Up @@ -3152,6 +3153,14 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface {
SVE_3VREG_COMMUTATIVE_MACRO_LIST(DEFINE_MACRO_ASM_FUNC)
#undef DEFINE_MACRO_ASM_FUNC

// Macro-assembler wrapper for bcax (bit clear and exclusive-OR).
// Asserts that macro instructions are currently allowed, then forwards all
// four registers unchanged to the underlying assembler method.
void Bcax(const VRegister& vd,
const VRegister& vn,
const VRegister& vm,
const VRegister& va) {
VIXL_ASSERT(allow_macro_instructions_);
// NOTE(review): the guard presumably checks that exactly one instruction is
// emitted while it is in scope - confirm against SingleEmissionCheckScope.
SingleEmissionCheckScope guard(this);
bcax(vd, vn, vm, va);
}
void Bic(const VRegister& vd, const int imm8, const int left_shift = 0) {
VIXL_ASSERT(allow_macro_instructions_);
SingleEmissionCheckScope guard(this);
Expand Down Expand Up @@ -3192,6 +3201,14 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface {
SingleEmissionCheckScope guard(this);
dup(vd, rn);
}
// Macro-assembler wrapper for eor3 (three-way exclusive-OR).
// Asserts that macro instructions are currently allowed, then forwards all
// four registers unchanged to the underlying assembler method.
void Eor3(const VRegister& vd,
const VRegister& vn,
const VRegister& vm,
const VRegister& va) {
VIXL_ASSERT(allow_macro_instructions_);
// NOTE(review): the guard presumably checks that exactly one instruction is
// emitted while it is in scope - confirm against SingleEmissionCheckScope.
SingleEmissionCheckScope guard(this);
eor3(vd, vn, vm, va);
}
void Ext(const VRegister& vd,
const VRegister& vn,
const VRegister& vm,
Expand Down Expand Up @@ -3498,6 +3515,14 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface {
SingleEmissionCheckScope guard(this);
umov(rd, vn, vn_index);
}
// Macro-assembler wrapper for xar (exclusive-OR and rotate).
// Asserts that macro instructions are currently allowed, then forwards the
// registers and the rotate amount unchanged to the underlying assembler
// method; `rotate` is not range-checked here.
void Xar(const VRegister& vd,
const VRegister& vn,
const VRegister& vm,
int rotate) {
VIXL_ASSERT(allow_macro_instructions_);
// NOTE(review): the guard presumably checks that exactly one instruction is
// emitted while it is in scope - confirm against SingleEmissionCheckScope.
SingleEmissionCheckScope guard(this);
xar(vd, vn, vm, rotate);
}
void Crc32b(const Register& rd, const Register& rn, const Register& rm) {
VIXL_ASSERT(allow_macro_instructions_);
SingleEmissionCheckScope guard(this);
Expand Down
32 changes: 32 additions & 0 deletions src/aarch64/simulator-aarch64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,10 @@ const Simulator::FormToVisitorFnMap* Simulator::GetFormToVisitorFnMap() {
{"umax_64u_minmax_imm"_h, &Simulator::SimulateUnsignedMinMax},
{"umin_32u_minmax_imm"_h, &Simulator::SimulateUnsignedMinMax},
{"umin_64u_minmax_imm"_h, &Simulator::SimulateUnsignedMinMax},
{"bcax_vvv16_crypto4"_h, &Simulator::SimulateNEONSHA3},
{"eor3_vvv16_crypto4"_h, &Simulator::SimulateNEONSHA3},
{"rax1_vvv2_cryptosha512_3"_h, &Simulator::SimulateNEONSHA3},
{"xar_vvv2_crypto3_imm6"_h, &Simulator::SimulateNEONSHA3},
};
return &form_to_visitor;
}
Expand Down Expand Up @@ -9926,6 +9930,34 @@ void Simulator::VisitNEONPerm(const Instruction* instr) {
}
}

void Simulator::SimulateNEONSHA3(const Instruction* instr) {
SimVRegister& rd = ReadVRegister(instr->GetRd());
SimVRegister& rn = ReadVRegister(instr->GetRn());
SimVRegister& rm = ReadVRegister(instr->GetRm());
SimVRegister& ra = ReadVRegister(instr->GetRa());
SimVRegister temp;

switch (form_hash_) {
case "bcax_vvv16_crypto4"_h:
bic(kFormat16B, temp, rm, ra);
eor(kFormat16B, rd, rn, temp);
break;
case "eor3_vvv16_crypto4"_h:
eor(kFormat16B, temp, rm, ra);
eor(kFormat16B, rd, rn, temp);
break;
case "rax1_vvv2_cryptosha512_3"_h:
ror(kFormat2D, temp, rm, 63); // rol(1) => ror(63)
eor(kFormat2D, rd, rn, temp);
break;
case "xar_vvv2_crypto3_imm6"_h:
int rot = instr->ExtractBits(15, 10);
eor(kFormat2D, temp, rn, rm);
ror(kFormat2D, rd, temp, rot);
break;
}
}

void Simulator::VisitSVEAddressGeneration(const Instruction* instr) {
SimVRegister& zd = ReadVRegister(instr->GetRd());
SimVRegister& zn = ReadVRegister(instr->GetRn());
Expand Down
1 change: 1 addition & 0 deletions src/aarch64/simulator-aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -1509,6 +1509,7 @@ class Simulator : public DecoderVisitor {
void SimulateNEONFPMulByElementLong(const Instruction* instr);
void SimulateNEONComplexMulByElement(const Instruction* instr);
void SimulateNEONDotProdByElement(const Instruction* instr);
// Simulates the NEON SHA3 instructions bcax, eor3, rax1 and xar.
void SimulateNEONSHA3(const Instruction* instr);
void SimulateMTEAddSubTag(const Instruction* instr);
void SimulateMTETagMaskInsert(const Instruction* instr);
void SimulateMTESubPointer(const Instruction* instr);
Expand Down
9 changes: 9 additions & 0 deletions test/aarch64/test-cpu-features-aarch64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3785,5 +3785,14 @@ TEST_FP_FCMA_NEON_NEONHALF(fcmla_3, fcmla(v0.V8H(), v1.V8H(), v2.V8H(), 0))
TEST_FEAT(pmull1q_0, pmull(v5.V1Q(), v6.V1D(), v7.V1D()))
#undef TEST_FEAT

// Every SHA3 instruction must report both NEON and SHA3 as required features.
#define TEST_NEON_SHA3(NAME, ASM)                                     \
  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3), \
                NEON_SHA3_##NAME,                                     \
                ASM)
TEST_NEON_SHA3(bcax_0, bcax(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()))
TEST_NEON_SHA3(eor3_0, eor3(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()))
TEST_NEON_SHA3(xar_0, xar(v0.V2D(), v1.V2D(), v2.V2D(), 42))
TEST_NEON_SHA3(rax1_0, rax1(v0.V2D(), v1.V2D(), v2.V2D()))
// Keep the helper macro local to this group, as is done for TEST_FEAT.
#undef TEST_NEON_SHA3

} // namespace aarch64
} // namespace vixl
8 changes: 4 additions & 4 deletions test/aarch64/test-disasm-aarch64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3789,10 +3789,10 @@ TEST(architecture_features) {
COMPARE_PREFIX(dci(0xd503221f), "esb"); // ESB_HI_hints

// ARMv8.2 - SHA3
// COMPARE_PREFIX(dci(0xce000000), "eor3"); // EOR3_VVV16_crypto4
// COMPARE_PREFIX(dci(0xce200000), "bcax"); // BCAX_VVV16_crypto4
// COMPARE_PREFIX(dci(0xce608c00), "rax1"); // RAX1_VVV2_cryptosha512_3
// COMPARE_PREFIX(dci(0xce800000), "xar"); // XAR_VVV2_crypto3_imm6
COMPARE_PREFIX(dci(0xce000000), "eor3"); // EOR3_VVV16_crypto4
COMPARE_PREFIX(dci(0xce200000), "bcax"); // BCAX_VVV16_crypto4
COMPARE_PREFIX(dci(0xce608c00), "rax1"); // RAX1_VVV2_cryptosha512_3
COMPARE_PREFIX(dci(0xce800000), "xar"); // XAR_VVV2_crypto3_imm6

// ARMv8.2 - SHA512
// COMPARE_PREFIX(dci(0xce608000), "sha512h"); // SHA512H_QQV_cryptosha512_3
Expand Down
14 changes: 14 additions & 0 deletions test/aarch64/test-disasm-neon-aarch64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4516,6 +4516,20 @@ TEST(neon_matmul) {
CLEANUP();
}

// Checks the assembled encoding and disassembly of the NEON SHA3 macros.
TEST(neon_sha3) {
  SETUP();

  COMPARE_MACRO(Bcax(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()),
                "bcax v0.16b, v1.16b, v2.16b, v3.16b");
  COMPARE_MACRO(Eor3(v10.V16B(), v11.V16B(), v12.V16B(), v13.V16B()),
                "eor3 v10.16b, v11.16b, v12.16b, v13.16b");
  COMPARE_MACRO(Xar(v20.V2D(), v21.V2D(), v22.V2D(), 42),
                "xar v20.2d, v21.2d, v22.2d, #42");
  // Cover both ends of the six-bit rotate field as well as a mid-range value.
  COMPARE_MACRO(Xar(v20.V2D(), v21.V2D(), v22.V2D(), 0),
                "xar v20.2d, v21.2d, v22.2d, #0");
  COMPARE_MACRO(Xar(v20.V2D(), v21.V2D(), v22.V2D(), 63),
                "xar v20.2d, v21.2d, v22.2d, #63");
  COMPARE_MACRO(Rax1(v0.V2D(), v1.V2D(), v2.V2D()), "rax1 v0.2d, v1.2d, v2.2d");

  CLEANUP();
}

TEST(neon_unallocated_regression_test) {
SETUP();

Expand Down
Loading

0 comments on commit f307b4a

Please sign in to comment.