Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure predicate cache is reset when control flow leaves block #4274

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion FEXCore/Scripts/json_ir_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def parse_ops(ops):
(OpArg.Type == "GPR" or
OpArg.Type == "GPRPair" or
OpArg.Type == "FPR" or
OpArg.Type == "PR")):
OpArg.Type == "PRED")):
OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class")

OpArg.Name = ArgName
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
Ref MemSrc = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
// Using SVE we can load this with a single instruction.
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc);
} else {
// For X87 extended doubles, Split the load.
Expand Down Expand Up @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
if (OpSize == OpSize::f80Bit) {
Ref MemStoreDst = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst);
} else {
// For X87 extended doubles, split before storing
Expand Down
8 changes: 3 additions & 5 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,6 @@ class OpDispatchBuilder final : public IREmitter {
void FNINIT(OpcodeArgs);

void X87ModifySTP(OpcodeArgs, bool Inc);
void X87SinCos(OpcodeArgs);
void X87FYL2X(OpcodeArgs, bool IsFYL2XP1);
void X87LDENV(OpcodeArgs);
void X87FLDCW(OpcodeArgs);
Expand Down Expand Up @@ -764,9 +763,6 @@ class OpDispatchBuilder final : public IREmitter {
void FTSTF64(OpcodeArgs);
void FRNDINTF64(OpcodeArgs);
void FSQRTF64(OpcodeArgs);
void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp);
void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp);
void X87SinCosF64(OpcodeArgs);
void X87FLDCWF64(OpcodeArgs);
void X87TANF64(OpcodeArgs);
void X87ATANF64(OpcodeArgs);
Expand Down Expand Up @@ -1175,9 +1171,11 @@ class OpDispatchBuilder final : public IREmitter {
}

void FlushRegisterCache(bool SRAOnly = false) {
// At block boundaries, fix up the carry flag.

// At block boundaries, fix up the carry flag, and reset the predicate cache.
if (!SRAOnly) {
RectifyCarryInvert(CFInvertedABI);
ResetInitPredicateCache();
}

CalculateDeferredFlags();
Expand Down
6 changes: 5 additions & 1 deletion FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,11 @@ void OpDispatchBuilder::FILD(OpcodeArgs) {

void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i128Bit, true, Width);
Ref PredReg = Invalid();
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
PredReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
}
_StoreStackMemory(PredReg, Mem, OpSize::i128Bit, true, Width);
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
}
Expand Down
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {

void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i64Bit, true, Width);
_StoreStackMemory(Invalid(), Mem, OpSize::i64Bit, true, Width);

if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
Expand Down
5 changes: 3 additions & 2 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -2788,13 +2788,14 @@
"HasSideEffects": true,
"X87": true
},
"StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": {
"StoreStackMemory PRED:$PredReg, GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": {
"Desc": [
"Takes the top value off the x87 stack and stores it to memory.",
"SourceSize is 128bit for F80 values, 64-bit for low precision.",
"StoreSize is the store size for conversion:",
"Float: 80-bit, 64-bit, or 32-bit",
"Int: 64-bit, 32-bit, 16-bit"
"Int: 64-bit, 32-bit, 16-bit",
"If possible, it will use the PredReg for an SVE store."
],
"HasSideEffects": true,
"X87": true
Expand Down
1 change: 1 addition & 0 deletions FEXCore/Source/Interface/IR/IREmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) {
case FPRClass:
case GPRFixedClass:
case FPRFixedClass:
case PREDClass:
case InvalidClass: return Class;
default: break;
}
Expand Down
34 changes: 33 additions & 1 deletion FEXCore/Source/Interface/IR/IREmitter.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: MIT
#pragma once

#include "CodeEmitter/Emitter.h"
#include "Interface/IR/IR.h"
#include "Interface/IR/IntrusiveIRList.h"

Expand All @@ -9,9 +10,9 @@

#include <FEXCore/Utils/LogManager.h>
#include <FEXCore/fextl/vector.h>
#include <FEXCore/fextl/unordered_map.h>

#include <algorithm>
#include <new>
#include <stdint.h>
#include <string.h>

Expand Down Expand Up @@ -45,6 +46,37 @@ class IREmitter {
}
void ResetWorkingList();

// Predicate cache implementation.
// This lives here rather than in OpcodeDispatcher because the X87StackOptimization
// pass also needs it.
// Lookup key for the predicate cache: one entry per (Pattern, Size) pair.
struct PredicateKey {
  ARMEmitter::PredicatePattern Pattern;
  OpSize Size;

  // Defaulted member-wise comparison: keys are equal when both Pattern and Size are equal.
  bool operator==(const PredicateKey& Other) const = default;
};

struct PredicateKeyHash {
size_t operator()(const PredicateKey& key) const {
return FEXCore::ToUnderlying(key.Pattern) + (FEXCore::ToUnderlying(key.Size) * FEXCore::ToUnderlying(OpSize::iInvalid));
}
};
// Maps a (Pattern, Size) key to the Ref obtained from _InitPredicate for that pair.
// Filled by InitPredicateCached() and emptied by ResetInitPredicateCache().
fextl::unordered_map<PredicateKey, Ref, PredicateKeyHash> InitPredicateCache;

// Returns the predicate node for the given Size/Pattern pair.
// On a cache hit the previously stored node is returned as-is; on a miss
// _InitPredicate is called once and its result is stored for later lookups.
Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) {
  const PredicateKey Key {Pattern, Size};

  // Cache hit: reuse the node recorded earlier for this key.
  if (auto Cached = InitPredicateCache.find(Key); Cached != InitPredicateCache.end()) {
    return Cached->second;
  }

  // Cache miss: create the predicate and remember it under this key.
  auto Predicate = _InitPredicate(Size, static_cast<uint8_t>(FEXCore::ToUnderlying(Pattern)));
  InitPredicateCache[Key] = Predicate;
  return Predicate;
}

// Empties the predicate cache. After this, the next InitPredicateCached() call for
// any key misses the cache and calls _InitPredicate again.
void ResetInitPredicateCache() {
InitPredicateCache.clear();
}

/**
* @name IR allocation routines
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -822,10 +822,11 @@ void X87StackOptimization::Run(IREmitter* Emit) {
if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert
StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode);
}
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
if (Op->StoreSize == OpSize::f80Bit) {
Ref PredReg = CurrentIR.GetNode(Op->PredReg);
bool CanUsePredicateStore = (Features.SupportsSVE128 || Features.SupportsSVE256) && PredReg;
if (CanUsePredicateStore) {
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode);
} else {
// For X87 extended doubles, split before storing
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
Expand Down
30 changes: 7 additions & 23 deletions unittests/InstructionCountCI/X87ldst-SVE.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
"ExpectedInstructionCount": 13,
"Comment": "Single 80-bit store.",
"ExpectedArm64ASM": [
"ptrue p2.h, vl5",
"ldrb w20, [x28, #1019]",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x4]",
"ldrb w21, [x28, #1298]",
"mov w22, #0x1",
Expand All @@ -34,16 +34,16 @@
},
"2-store 80bit": {
"x86InstructionCount": 2,
"ExpectedInstructionCount": 25,
"ExpectedInstructionCount": 24,
"x86Insts": [
"fstp tword [rax]",
"fstp tword [rax+10]"
],
"ExpectedArm64ASM": [
"ptrue p2.h, vl5",
"ldrb w20, [x28, #1019]",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x4]",
"ldrb w21, [x28, #1298]",
"mov w22, #0x1",
Expand All @@ -56,7 +56,6 @@
"add x21, x4, #0xa (10)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w22, w22, w20",
Expand All @@ -69,7 +68,7 @@
},
"8-store 80bit": {
"x86InstructionCount": 8,
"ExpectedInstructionCount": 97,
"ExpectedInstructionCount": 90,
"x86Insts": [
"fstp tword [rax]",
"fstp tword [rax+10]",
Expand All @@ -81,10 +80,10 @@
"fstp tword [rax+70]"
],
"ExpectedArm64ASM": [
"ptrue p2.h, vl5",
"ldrb w20, [x28, #1019]",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x4]",
"ldrb w21, [x28, #1298]",
"mov w22, #0x1",
Expand All @@ -97,7 +96,6 @@
"add x21, x4, #0xa (10)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -109,7 +107,6 @@
"add x21, x4, #0x14 (20)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -121,7 +118,6 @@
"add x21, x4, #0x1e (30)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -133,7 +129,6 @@
"add x21, x4, #0x28 (40)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -145,7 +140,6 @@
"add x21, x4, #0x32 (50)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -157,7 +151,6 @@
"add x21, x4, #0x3c (60)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -169,7 +162,6 @@
"add x21, x4, #0x46 (70)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w22, w22, w20",
Expand Down Expand Up @@ -201,7 +193,7 @@
},
"2-load 80bit": {
"x86InstructionCount": 2,
"ExpectedInstructionCount": 22,
"ExpectedInstructionCount": 21,
"x86Insts": [
"fld tword [rax]",
"fld tword [rax+10]"
Expand All @@ -210,7 +202,6 @@
"ptrue p2.h, vl5",
"ld1h {z2.h}, p2/z, [x4]",
"add x20, x4, #0xa (10)",
"ptrue p2.h, vl5",
"ld1h {z3.h}, p2/z, [x20]",
"ldrb w20, [x28, #1019]",
"sub w20, w20, #0x2 (2)",
Expand All @@ -233,7 +224,7 @@
},
"8-load 80bit": {
"x86InstructionCount": 8,
"ExpectedInstructionCount": 59,
"ExpectedInstructionCount": 52,
"x86Insts": [
"fld tword [rax]",
"fld tword [rax+10]",
Expand All @@ -248,25 +239,18 @@
"ptrue p2.h, vl5",
"ld1h {z2.h}, p2/z, [x4]",
"add x20, x4, #0xa (10)",
"ptrue p2.h, vl5",
"ld1h {z3.h}, p2/z, [x20]",
"add x20, x4, #0x14 (20)",
"ptrue p2.h, vl5",
"ld1h {z4.h}, p2/z, [x20]",
"add x20, x4, #0x1e (30)",
"ptrue p2.h, vl5",
"ld1h {z5.h}, p2/z, [x20]",
"add x20, x4, #0x28 (40)",
"ptrue p2.h, vl5",
"ld1h {z6.h}, p2/z, [x20]",
"add x20, x4, #0x32 (50)",
"ptrue p2.h, vl5",
"ld1h {z7.h}, p2/z, [x20]",
"add x20, x4, #0x3c (60)",
"ptrue p2.h, vl5",
"ld1h {z8.h}, p2/z, [x20]",
"add x20, x4, #0x46 (70)",
"ptrue p2.h, vl5",
"ld1h {z9.h}, p2/z, [x20]",
"ldrb w20, [x28, #1019]",
"sub w20, w20, #0x8 (8)",
Expand Down
Loading