From 663b4db9cab6f3d0a8de32f74d7e0d49015a0f62 Mon Sep 17 00:00:00 2001 From: Sintendo <3380580+Sintendo@users.noreply.github.com> Date: Sun, 29 Dec 2024 11:45:59 +0100 Subject: [PATCH 1/5] CPUDetect: Add CSSC flag --- Source/Core/Common/CPUDetect.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Source/Core/Common/CPUDetect.h b/Source/Core/Common/CPUDetect.h index 5a4fac2364a1..6731e373eaac 100644 --- a/Source/Core/Common/CPUDetect.h +++ b/Source/Core/Common/CPUDetect.h @@ -47,7 +47,8 @@ struct CPUInfo bool bSHA2 = false; // ARMv8 specific - bool bAFP = false; // Alternate floating-point behavior + bool bAFP = false; // Alternate floating-point behavior + bool bCSSC = false; // Common Short Sequence Compression // Call Detect() explicit CPUInfo(); From 4835e620c38b07e4d543ac7bf5b3da2ac6ae4dcf Mon Sep 17 00:00:00 2001 From: Sintendo <3380580+Sintendo@users.noreply.github.com> Date: Sun, 29 Dec 2024 11:47:07 +0100 Subject: [PATCH 2/5] ArmCPUDetect: Expose CSSC in CPU info summary --- Source/Core/Common/ArmCPUDetect.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Source/Core/Common/ArmCPUDetect.cpp b/Source/Core/Common/ArmCPUDetect.cpp index 90cc0f7b44a4..f481655506f3 100644 --- a/Source/Core/Common/ArmCPUDetect.cpp +++ b/Source/Core/Common/ArmCPUDetect.cpp @@ -332,6 +332,8 @@ std::string CPUInfo::Summarize() sum.push_back("SHA1"); if (bSHA2) sum.push_back("SHA2"); + if (bCSSC) + sum.push_back("CSSC"); return fmt::to_string(fmt::join(sum, ",")); } From 47bc36d319fa02873e632246a76a56b7f212e20e Mon Sep 17 00:00:00 2001 From: Sintendo <3380580+Sintendo@users.noreply.github.com> Date: Sun, 29 Dec 2024 11:49:25 +0100 Subject: [PATCH 3/5] Arm64Emitter: Add CSSC instruction emitters --- Source/Core/Common/Arm64Emitter.cpp | 130 ++++++++++++++++++++-------- Source/Core/Common/Arm64Emitter.h | 48 +++++++++- 2 files changed, 143 insertions(+), 35 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 
8d71016f7e14..3c386938d126 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -16,6 +16,7 @@ #include "Common/Align.h" #include "Common/Assert.h" +#include "Common/CPUDetect.h" #include "Common/CommonTypes.h" #include "Common/MathUtil.h" #include "Common/SmallVector.h" @@ -204,24 +205,9 @@ static const u32 Data1SrcEnc[][2] = { {0, 3}, // REV64 {0, 4}, // CLZ {0, 5}, // CLS -}; - -// Data-Processing (2 source) -static const u32 Data2SrcEnc[] = { - 0x02, // UDIV - 0x03, // SDIV - 0x08, // LSLV - 0x09, // LSRV - 0x0A, // ASRV - 0x0B, // RORV - 0x10, // CRC32B - 0x11, // CRC32H - 0x12, // CRC32W - 0x14, // CRC32CB - 0x15, // CRC32CH - 0x16, // CRC32CW - 0x13, // CRC32X (64bit Only) - 0x17, // XRC32CX (64bit Only) + {0, 6}, // CTZ + {0, 7}, // CNT + {0, 8}, // ABS }; // Data-Processing (3 source) @@ -409,14 +395,22 @@ void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn) (Data1SrcEnc[instenc][1] << 10) | (DecodeReg(Rn) << 5) | DecodeReg(Rd)); } -void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +void ARM64XEmitter::EncodeData2SrcInst(Data2SrcEnc instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { bool b64Bit = Is64Bit(Rd); - Write32((b64Bit << 31) | (0x0D6 << 21) | (DecodeReg(Rm) << 16) | (Data2SrcEnc[instenc] << 10) | + Write32((b64Bit << 31) | (0x0D6 << 21) | (DecodeReg(Rm) << 16) | (u32(instenc) << 10) | (DecodeReg(Rn) << 5) | DecodeReg(Rd)); } +void ARM64XEmitter::EncodeDataCSSCImmInst(DataCSSCImm8Enc opc, ARM64Reg Rd, ARM64Reg Rn, u8 imm) +{ + bool b64Bit = Is64Bit(Rd); + + Write32((b64Bit << 31) | (0x47 << 22) | (u32(opc) << 18) | (imm << 10) | (DecodeReg(Rn) << 5) | + DecodeReg(Rd)); +} + void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { @@ -1180,59 +1174,59 @@ void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn) // Data-Processing 2 source void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - 
EncodeData2SrcInst(0, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::UDIV, Rd, Rn, Rm); } void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(1, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::SDIV, Rd, Rn, Rm); } void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(2, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::LSLV, Rd, Rn, Rm); } void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(3, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::LSRV, Rd, Rn, Rm); } void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(4, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::ASRV, Rd, Rn, Rm); } void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(5, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::RORV, Rd, Rn, Rm); } void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(6, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::CRC32B, Rd, Rn, Rm); } void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(7, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::CRC32H, Rd, Rn, Rm); } void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(8, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::CRC32W, Rd, Rn, Rm); } void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(9, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::CRC32CB, Rd, Rn, Rm); } void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(10, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::CRC32CH, Rd, Rn, Rm); } void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(11, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::CRC32CW, Rd, Rn, Rm); } void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(12, Rd, Rn, Rm); + 
EncodeData2SrcInst(Data2SrcEnc::CRC32X, Rd, Rn, Rm); } void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeData2SrcInst(13, Rd, Rn, Rm); + EncodeData2SrcInst(Data2SrcEnc::CRC32CX, Rd, Rn, Rm); } // Data-Processing 3 source @@ -1775,6 +1769,74 @@ void ARM64XEmitter::ADRP(ARM64Reg Rd, s64 imm) EncodeAddressInst(1, Rd, static_cast<s32>(imm >> 12)); } +// Common Short Sequence Compression (CSSC) instructions +void ARM64XEmitter::ABS(ARM64Reg Rd, ARM64Reg Rn) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer."); + EncodeData1SrcInst(8, Rd, Rn); +} +void ARM64XEmitter::CNT(ARM64Reg Rd, ARM64Reg Rn) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer."); + EncodeData1SrcInst(7, Rd, Rn); +} +void ARM64XEmitter::CTZ(ARM64Reg Rd, ARM64Reg Rn) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer."); + EncodeData1SrcInst(6, Rd, Rn); +} +void ARM64XEmitter::SMIN(ARM64Reg Rd, ARM64Reg Rn, s8 imm) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer."); + EncodeDataCSSCImmInst(DataCSSCImm8Enc::SMIN, Rd, Rn, static_cast<u8>(imm)); +} +void ARM64XEmitter::SMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer."); + EncodeData2SrcInst(Data2SrcEnc::SMIN, Rd, Rn, Rm); +} +void ARM64XEmitter::SMAX(ARM64Reg Rd, ARM64Reg Rn, s8 imm) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer."); + EncodeDataCSSCImmInst(DataCSSCImm8Enc::SMAX, Rd, Rn, static_cast<u8>(imm)); +} +void ARM64XEmitter::SMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. 
Bad programmer."); + EncodeData2SrcInst(Data2SrcEnc::SMAX, Rd, Rn, Rm); +} +void ARM64XEmitter::UMIN(ARM64Reg Rd, ARM64Reg Rn, u8 imm) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer."); + EncodeDataCSSCImmInst(DataCSSCImm8Enc::UMIN, Rd, Rn, static_cast<u8>(imm)); +} +void ARM64XEmitter::UMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer."); + EncodeData2SrcInst(Data2SrcEnc::UMIN, Rd, Rn, Rm); +} +void ARM64XEmitter::UMAX(ARM64Reg Rd, ARM64Reg Rn, u8 imm) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer."); + EncodeDataCSSCImmInst(DataCSSCImm8Enc::UMAX, Rd, Rn, static_cast<u8>(imm)); +} +void ARM64XEmitter::UMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + if (!cpu_info.bCSSC) + PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer."); + EncodeData2SrcInst(Data2SrcEnc::UMAX, Rd, Rn, Rm); +} + // This is using a hand-rolled algorithm. The goal is zero memory allocations, not necessarily // the best JIT-time time complexity. (The number of moves is usually very small.) void ARM64XEmitter::ParallelMoves(RegisterMove* begin, RegisterMove* end, diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 134f9b64cce5..3c2e1062a290 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -624,6 +624,38 @@ class ARM64XEmitter // Must be cleared with SetCodePtr() afterwards. 
bool m_write_failed = false; + // Data-Processing (2 source) + enum class Data2SrcEnc : u32 + { + UDIV = 0x02, + SDIV = 0x03, + LSLV = 0x08, + LSRV = 0x09, + ASRV = 0x0A, + RORV = 0x0B, + CRC32B = 0x10, + CRC32H = 0x11, + CRC32W = 0x12, + CRC32CB = 0x14, + CRC32CH = 0x15, + CRC32CW = 0x16, + CRC32X = 0x13, // 64-bit only + CRC32CX = 0x17, // 64-bit only + // CSSC + SMAX = 0x18, + UMAX = 0x19, + SMIN = 0x1A, + UMIN = 0x1B, + }; + + enum class DataCSSCImm8Enc : u8 + { + SMAX = 0b0000, + UMAX = 0b0001, + SMIN = 0b0010, + UMIN = 0b0011, + }; + void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags); void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr); void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr); @@ -638,7 +670,7 @@ class ARM64XEmitter void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn); - void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeData2SrcInst(Data2SrcEnc instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm); @@ -655,6 +687,7 @@ class ARM64XEmitter s32 imm); void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm); void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EncodeDataCSSCImmInst(DataCSSCImm8Enc opc, ARM64Reg Rd, ARM64Reg Rn, u8 imm); [[nodiscard]] FixupBranch WriteFixupBranch(); @@ -1021,6 +1054,19 @@ class ARM64XEmitter void ADR(ARM64Reg Rd, s32 imm); void ADRP(ARM64Reg Rd, s64 imm); + // Common Short Sequence Compression (CSSC) instructions + void ABS(ARM64Reg Rd, ARM64Reg 
Rn); + void CNT(ARM64Reg Rd, ARM64Reg Rn); + void CTZ(ARM64Reg Rd, ARM64Reg Rn); + void SMIN(ARM64Reg Rd, ARM64Reg Rn, s8 imm); + void SMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SMAX(ARM64Reg Rd, ARM64Reg Rn, s8 imm); + void SMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMIN(ARM64Reg Rd, ARM64Reg Rn, u8 imm); + void UMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMAX(ARM64Reg Rd, ARM64Reg Rn, u8 imm); + void UMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + // Wrapper around ADR/ADRP/MOVZ/MOVN/MOVK void MOVI2R(ARM64Reg Rd, u64 imm); bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2); From 4d5d6d34640a7bf155aa0d7b573eae48d1edf10e Mon Sep 17 00:00:00 2001 From: Sintendo <3380580+Sintendo@users.noreply.github.com> Date: Sun, 29 Dec 2024 11:51:31 +0100 Subject: [PATCH 4/5] JitArm64_LoadStore: Use UMIN in dcbx when available --- .../Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index ebcb8142b7a4..88fa3e727a96 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -7,6 +7,7 @@ #include "Common/Arm64Emitter.h" #include "Common/BitSet.h" +#include "Common/CPUDetect.h" #include "Common/CommonTypes.h" #include "Common/ScopeGuard.h" @@ -805,8 +806,15 @@ void JitArm64::dcbx(UGeckoInstruction inst) SDIV(WB, reg_downcount, reg_cycle_count); // WB = downcount / cycle_count SUB(WA, loop_counter, 1); // WA = CTR - 1 // ^ Note that this CTR-1 implicitly handles the CTR == 0 case correctly. 
- CMP(WB, WA); - CSEL(WA, WB, WA, CCFlags::CC_LO); // WA = min(WB, WA) + if (cpu_info.bCSSC) + { + UMIN(WA, WB, WA); + } + else + { + CMP(WB, WA); + CSEL(WA, WB, WA, CCFlags::CC_LO); // WA = min(WB, WA) + } // WA now holds the amount of loops to execute minus 1, which is the amount we need to adjust // downcount, CTR, and Rb by to exit the loop construct with the right values in those From 952fc0d79514efe03ddd5b734da59e45ae8d3849 Mon Sep 17 00:00:00 2001 From: Sintendo <3380580+Sintendo@users.noreply.github.com> Date: Sun, 29 Dec 2024 11:13:11 +0100 Subject: [PATCH 5/5] ArmCPUDetect: Implement Linux CSSC detection --- Source/Core/Common/ArmCPUDetect.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Source/Core/Common/ArmCPUDetect.cpp b/Source/Core/Common/ArmCPUDetect.cpp index f481655506f3..95d5c65a7fb8 100644 --- a/Source/Core/Common/ArmCPUDetect.cpp +++ b/Source/Core/Common/ArmCPUDetect.cpp @@ -273,9 +273,14 @@ void CPUInfo::Detect() bSHA1 = hwcap & HWCAP_SHA1; bSHA2 = hwcap & HWCAP_SHA2; -#if defined(AT_HWCAP2) && defined(HWCAP2_AFP) +#if defined(AT_HWCAP2) const u32 hwcap2 = ReadHwCap(AT_HWCAP2); +#if defined(HWCAP2_AFP) bAFP = hwcap2 & HWCAP2_AFP; +#endif +#if defined(HWCAP2_CSSC) + bCSSC = hwcap2 & HWCAP2_CSSC; +#endif #endif u64 midr = 0;