FEXCore/JIT: Encode the JITRIPReconstructionEntries using variable length integer

When #2722 implemented this initially and #4271 switched over to signed
int16_t, the assumption was that int16_t was a reasonable trade-off
between encoding size and having to deal with 8-bit values being too
small in some cases.

In the common case we are almost always encoding 8-bit values because
instructions are typically linear (and less than 15 bytes in size), but
16-bit was chosen because an optimizing JIT and runs of instructions
that don't cause exceptions can accumulate offsets larger than 8-bit
can hold.

Instead of hardcoding 16-bit values, implement a variable length
integer class where ~96.8% of values are encoded as 8-bit and the
remaining ~3.19% as 16-bit. Due to constraints that #4271 put in place,
we can basically guarantee for now that branch targets fit within
16-bit. The VL class also supports 32-bit and 64-bit, so if that
behaviour changes, nothing else needs to change.

Some stats when running Sonic Mania with multiblock enabled.
Encoded integers: 3,504,907
Encoded 8-bit:    3,393,095 (96.8%)
Encoded 16-bit:     111,812 (3.19%)
Encoded 32/64-bit:        0

Encoded Size:       3,615,181 bytes (3.44MiB)
Fixed encoded size: 7,007,604 bytes (6.68MiB)

Definitely worth using and saves the headache of large RIP/PC offsets
causing problems.
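
To make the size trade-off concrete, here is a minimal, hypothetical sketch
using the EncodedSize helper from the new variable_length_integer.h header
added below. It assumes the header is on the include path; the example
offsets (4 host bytes, 3 guest bytes since the previous entry) are invented
for illustration, and only the vl64 API itself comes from this commit.

// Hypothetical per-entry size comparison; the offsets are invented for the example.
#include "Utils/variable_length_integer.h"

#include <cstdint>
#include <cstdio>

int main() {
  // A typical entry: 4 bytes of host code and 3 bytes of guest code since the previous entry.
  const int64_t HostPCOffset = 4;
  const int64_t GuestRIPOffset = 3;

  const size_t Variable = FEXCore::Utils::vl64::EncodedSize(HostPCOffset) +
                          FEXCore::Utils::vl64::EncodedSize(GuestRIPOffset);
  const size_t Fixed = sizeof(uint16_t) + sizeof(int16_t); // Old JITRIPReconstructEntries layout.

  // Prints "variable: 2 bytes, fixed: 4 bytes" - the per-entry saving behind
  // the 3.44MiB vs 6.68MiB totals above.
  printf("variable: %zu bytes, fixed: %zu bytes\n", Variable, Fixed);
  return 0;
}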
Sonicadvance1 committed Jan 27, 2025
1 parent 8d6a43d commit 0fa2f9c
Showing 6 changed files with 315 additions and 39 deletions.
6 changes: 2 additions & 4 deletions CodeEmitter/CodeEmitter/BranchOps.inl
@@ -227,8 +227,7 @@ public:
}

void tbz(ARMEmitter::Register rt, uint32_t Bit, ForwardLabel* Label) {
AddLocationToLabel(
Label, ForwardLabel::Reference {.Location = GetCursorAddress<uint8_t*>(), .Type = ForwardLabel::InstType::TEST_BRANCH});
AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress<uint8_t*>(), .Type = ForwardLabel::InstType::TEST_BRANCH});

constexpr uint32_t Op = 0b0011'0110 << 24;

@@ -258,8 +257,7 @@ public:
}

void tbnz(ARMEmitter::Register rt, uint32_t Bit, ForwardLabel* Label) {
AddLocationToLabel(
Label, ForwardLabel::Reference {.Location = GetCursorAddress<uint8_t*>(), .Type = ForwardLabel::InstType::TEST_BRANCH});
AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress<uint8_t*>(), .Type = ForwardLabel::InstType::TEST_BRANCH});
constexpr uint32_t Op = 0b0011'0111 << 24;

TestAndBranch(Op, rt, Bit, 0);
16 changes: 0 additions & 16 deletions FEXCore/Source/Interface/Core/CPUBackend.h
@@ -102,22 +102,6 @@ namespace CPU {
uint32_t _Pad;
};

// Entries that live after the JITCodeTail.
// These entries correlate JIT code regions with guest RIP regions.
// Using these entries FEX is able to reconstruct the guest RIP accurately when an instruction cause a signal fault.
// Packed using 16-bit entries to ensure the size isn't too large.
// These smaller sizes means that each entry is relative to each other instead of absolute offset from the start of the JIT block.
// When reconstructing the RIP, each entry must be walked linearly and accumulated with the previous entries.
// This is a trade-off between compression inside the JIT code space and execution time when reconstruction the RIP.
// RIP reconstruction when faulting is less likely so we are requiring the accumulation.
struct JITRIPReconstructEntries {
// The Host PC offset from the previous entry.
uint16_t HostPCOffset;

// How much to offset the RIP from the previous entry.
int16_t GuestRIPOffset;
};

/**
* @brief Tells this CPUBackend to compile code for the provided IR and DebugData
*
17 changes: 10 additions & 7 deletions FEXCore/Source/Interface/Core/Core.cpp
@@ -28,6 +28,7 @@ desc: Glues Frontend, OpDispatcher and IR Opts & Compilation, LookupCache, Dispa
#include "Utils/Allocator.h"
#include "Utils/Allocator/HostAllocator.h"
#include "Utils/SpinWaitLock.h"
#include "Utils/variable_length_integer.h"

#include <FEXCore/Config/Config.h>
#include <FEXCore/Core/Context.h>
@@ -144,24 +145,26 @@ uint64_t ContextImpl::RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* T
auto [InlineHeader, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame);

if (InlineHeader) {
auto RIPEntries = reinterpret_cast<const CPU::CPUBackend::JITRIPReconstructEntries*>(
Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail + InlineTail->OffsetToRIPEntries);

// Check if the host PC is currently within a code block.
// If it is then RIP can be reconstructed from the beginning of the code block.
// This is currently as close as FEX can get RIP reconstructions.
if (HostPC >= reinterpret_cast<uint64_t>(BlockBegin) && HostPC < reinterpret_cast<uint64_t>(BlockBegin + InlineTail->Size)) {

auto RIPEntry =
reinterpret_cast<const uint8_t*>(Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail + InlineTail->OffsetToRIPEntries);

// Reconstruct RIP from JIT entries for this block.
uint64_t StartingHostPC = BlockBegin;
uint64_t StartingGuestRIP = InlineTail->RIP;

for (uint32_t i = 0; i < InlineTail->NumberOfRIPEntries; ++i) {
const auto& RIPEntry = RIPEntries[i];
if (HostPC >= (StartingHostPC + RIPEntry.HostPCOffset)) {
auto HostPCOffset = FEXCore::Utils::vl64::Decode(RIPEntry);
RIPEntry += HostPCOffset.Size;
auto GuestRIPOffset = FEXCore::Utils::vl64::Decode(RIPEntry);
if (HostPC >= (StartingHostPC + HostPCOffset.Integer)) {
// We are beyond this entry, keep going forward.
StartingHostPC += RIPEntry.HostPCOffset;
StartingGuestRIP += RIPEntry.GuestRIPOffset;
StartingHostPC += HostPCOffset.Integer;
StartingGuestRIP += GuestRIPOffset.Integer;
} else {
// Passed where the Host PC is at. Break now.
break;
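
As a self-contained sketch of the walk above (not the exact FEX code): encode
two hand-made entries, then accumulate them until the faulting host PC is
passed. The block addresses, entry values, and entry count here are all
invented for the example; only vl64::Encode/Decode come from this commit.

// Standalone sketch of the RIP reconstruction walk; entries and addresses are invented.
#include "Utils/variable_length_integer.h"

#include <cstdint>
#include <cstdio>

int main() {
  using FEXCore::Utils::vl64;

  // Two fake entries, each a (HostPCOffset, GuestRIPOffset) pair: (4, 3) then (8, 5).
  uint8_t Entries[32];
  uint8_t* Write = Entries;
  const int64_t Offsets[] = {4, 3, 8, 5};
  for (int64_t Offset : Offsets) {
    Write += vl64::Encode(Write, Offset);
  }

  const uint64_t BlockBegin = 0x1000;     // Host address of the JIT block.
  const uint64_t BlockRIP = 0x40000;      // Guest RIP the block was compiled from.
  const uint64_t HostPC = BlockBegin + 6; // Fault somewhere inside the second host region.

  uint64_t StartingHostPC = BlockBegin;
  uint64_t StartingGuestRIP = BlockRIP;
  const uint8_t* RIPEntry = Entries;
  for (uint32_t i = 0; i < 2; ++i) {
    auto HostPCOffset = vl64::Decode(RIPEntry);
    RIPEntry += HostPCOffset.Size;
    auto GuestRIPOffset = vl64::Decode(RIPEntry);
    RIPEntry += GuestRIPOffset.Size;

    if (HostPC >= (StartingHostPC + HostPCOffset.Integer)) {
      // We are beyond this entry, keep going forward.
      StartingHostPC += HostPCOffset.Integer;
      StartingGuestRIP += GuestRIPOffset.Integer;
    } else {
      // Passed where the Host PC is at.
      break;
    }
  }

  // Prints 0x40003 for this fault address.
  printf("Reconstructed RIP: 0x%llx\n", static_cast<unsigned long long>(StartingGuestRIP));
  return 0;
}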
45 changes: 33 additions & 12 deletions FEXCore/Source/Interface/Core/JIT/JIT.cpp
@@ -21,6 +21,7 @@ desc: Main glue logic of the arm64 splatter backend
#include "Interface/IR/Passes/RegisterAllocationPass.h"

#include "Utils/MemberFunctionToPointer.h"
#include "Utils/variable_length_integer.h"

#include <FEXCore/Core/X86Enums.h>
#include <FEXCore/Debug/InternalThreadState.h>
@@ -853,10 +854,23 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size
auto JITBlockTail = GetCursorAddress<JITCodeTail*>();
CursorIncrement(sizeof(JITCodeTail));

auto JITRIPEntriesLocation = GetCursorAddress<uint8_t*>();
auto JITRIPEntries = GetCursorAddress<JITRIPReconstructEntries*>();
// Entries that live after the JITCodeTail.
// These entries correlate JIT code regions with guest RIP regions.
// Using these entries FEX is able to reconstruct the guest RIP accurately when an instruction causes a signal fault.
// Packed using two variable length integers per entry to ensure the size isn't too large.
// These smaller sizes mean that each entry is relative to the previous one instead of an absolute offset from the start of the JIT block.
// When reconstructing the RIP, each entry must be walked linearly and accumulated with the previous entries.
// This is a trade-off between compression inside the JIT code space and execution time when reconstructing the RIP.
// RIP reconstruction when faulting is the less likely path, so we accept the cost of the accumulation.
//
// struct {
// // The Host PC offset from the previous entry.
// FEXCore::Utils::vl64 HostPCOffset;
// // How much to offset the RIP from the previous entry.
// FEXCore::Utils::vl64 GuestRIPOffset;
// };

CursorIncrement(sizeof(JITRIPReconstructEntries) * DebugData->GuestOpcodes.size());
auto JITRIPEntriesBegin = GetCursorAddress<uint8_t*>();

// Put the block's RIP entry in the tail.
// This will be used for RIP reconstruction in the future.
@@ -867,27 +881,34 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size
JITBlockTail->SingleInst = SingleInst;
JITBlockTail->SpinLockFutex = 0;

auto JITRIPEntriesLocation = JITRIPEntriesBegin;

{
// Store the RIP entries.
JITBlockTail->NumberOfRIPEntries = DebugData->GuestOpcodes.size();
JITBlockTail->OffsetToRIPEntries = JITRIPEntriesLocation - JITBlockTailLocation;
JITBlockTail->OffsetToRIPEntries = JITRIPEntriesBegin - JITBlockTailLocation;
uintptr_t CurrentRIPOffset = 0;
uint64_t CurrentPCOffset = 0;

for (size_t i = 0; i < DebugData->GuestOpcodes.size(); i++) {
const auto& GuestOpcode = DebugData->GuestOpcodes[i];
auto& RIPEntry = JITRIPEntries[i];
[[maybe_unused]] uint64_t HostPCOffset = GuestOpcode.HostEntryOffset - CurrentPCOffset;
[[maybe_unused]] int64_t GuestRIPOffset = GuestOpcode.GuestEntryOffset - CurrentRIPOffset;
LOGMAN_THROW_A_FMT(HostPCOffset <= std::numeric_limits<uint16_t>::max(), "PC offset too large");
LOGMAN_THROW_A_FMT(GuestRIPOffset >= std::numeric_limits<int16_t>::min(), "RIP offset too small");
LOGMAN_THROW_A_FMT(GuestRIPOffset <= std::numeric_limits<int16_t>::max(), "RIP offset too large");
RIPEntry.HostPCOffset = GuestOpcode.HostEntryOffset - CurrentPCOffset;
RIPEntry.GuestRIPOffset = GuestOpcode.GuestEntryOffset - CurrentRIPOffset;
int64_t HostPCOffset = GuestOpcode.HostEntryOffset - CurrentPCOffset;
int64_t GuestRIPOffset = GuestOpcode.GuestEntryOffset - CurrentRIPOffset;

size_t Size = FEXCore::Utils::vl64::Encode(JITRIPEntriesLocation, HostPCOffset);
JITRIPEntriesLocation += Size;

Size = FEXCore::Utils::vl64::Encode(JITRIPEntriesLocation, GuestRIPOffset);
JITRIPEntriesLocation += Size;

CurrentPCOffset = GuestOpcode.HostEntryOffset;
CurrentRIPOffset = GuestOpcode.GuestEntryOffset;
}
}

CursorIncrement(JITRIPEntriesLocation - JITRIPEntriesBegin);
Align();

CodeHeader->OffsetToBlockTail = JITBlockTailLocation - CodeData.BlockBegin;

CodeData.Size = GetCursorAddress<uint8_t*>() - CodeData.BlockBegin;
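
The encoding side, condensed into a free-standing helper for clarity: the
GuestOpcodeInfo struct and call shape below are stand-ins invented for the
example, but the delta/encode/advance pattern mirrors the loop above, and the
returned byte count plays the role of the final CursorIncrement.

// Sketch of the encoding pass; GuestOpcodeInfo and the helper signature are invented.
#include "Utils/variable_length_integer.h"

#include <cstdint>
#include <vector>

struct GuestOpcodeInfo {
  uint64_t HostEntryOffset;  // Offset of this opcode's host code from the block start.
  uint64_t GuestEntryOffset; // Offset of this opcode's guest RIP from the block RIP.
};

// Writes both offsets of each entry back to back and returns how many bytes were produced,
// so the caller can advance its cursor by the actual encoded size.
size_t EncodeRIPEntries(uint8_t* Dst, const std::vector<GuestOpcodeInfo>& Opcodes) {
  uint8_t* Cursor = Dst;
  uint64_t CurrentPCOffset = 0;
  uint64_t CurrentRIPOffset = 0;

  for (const auto& Op : Opcodes) {
    // Deltas against the previous entry keep the values small enough for the
    // 8-bit encoding in the common case.
    const int64_t HostPCOffset = Op.HostEntryOffset - CurrentPCOffset;
    const int64_t GuestRIPOffset = Op.GuestEntryOffset - CurrentRIPOffset;

    Cursor += FEXCore::Utils::vl64::Encode(Cursor, HostPCOffset);
    Cursor += FEXCore::Utils::vl64::Encode(Cursor, GuestRIPOffset);

    CurrentPCOffset = Op.HostEntryOffset;
    CurrentRIPOffset = Op.GuestEntryOffset;
  }

  return Cursor - Dst;
}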
156 changes: 156 additions & 0 deletions FEXCore/Source/Utils/variable_length_integer.h
@@ -0,0 +1,156 @@
// SPDX-License-Identifier: MIT
#pragma once

#include <FEXCore/Utils/CompilerDefs.h>

#include <cstdio>
#include <cstdint>
#include <cstddef>
#include <limits>

namespace FEXCore::Utils {
// Variable length signed integer
// The most common encoding is an 8-bit positive value, but larger sizes can occur
//
// 8-bit:
// bit[7] = 0 - 8-bit
// bit[6:0] = 7-bit encoding
//
// 16-bit:
// byte1[7:6] = 0b10 - 16-bit
// byte1[5:0] = top 6-bits
// byte2[7:0] = Bottom 8 bits
//
// 32-bit
// byte1[7:5] = 0b110 - 32-bit
// byte1[4:0] = <reserved>
// word[31:0] = signed word
//
// 64-bit
// byte1[7:5] = 0b111 - 64-bit
// byte1[4:0] = <reserved>
// dword[63:0] = signed dword
struct vl64 final {
static size_t EncodedSize(int64_t Data) {
if (Data >= vl8_min && Data <= vl8_max) {
return sizeof(vl8_enc);
} else if (Data >= vl16_min && Data <= vl16_max) {
return sizeof(vl16_enc);
} else if (Data >= vl32_min && Data <= vl32_max) {
return sizeof(vl32_enc);
}
return sizeof(vl64_enc);
}

struct Decoded {
int64_t Integer;
size_t Size;
};

static Decoded Decode(const uint8_t* data) {
auto vl8_type = reinterpret_cast<const vl8_enc*>(data);
auto vl16_type = reinterpret_cast<const vl16_enc*>(data);
auto vl32_type = reinterpret_cast<const vl32_enc*>(data);
auto vl64_type = reinterpret_cast<const vl64_enc*>(data);

if (vl8_type->Type == vl8_type_header) {
return {vl8_type->Integer, sizeof(vl8_enc)};
} else if (vl16_type->Type == vl16_type_header) {
return {vl16_type->Integer, sizeof(vl16_enc)};
} else if (vl32_type->Type == vl32_type_header) {
return {vl32_type->Integer, sizeof(vl32_enc)};
}
return {vl64_type->Integer, sizeof(vl64_enc)};
}

static size_t Encode(uint8_t* dst, int64_t Data) {
auto vl8_type = reinterpret_cast<vl8_enc*>(dst);
auto vl16_type = reinterpret_cast<vl16_enc*>(dst);
auto vl32_type = reinterpret_cast<vl32_enc*>(dst);
auto vl64_type = reinterpret_cast<vl64_enc*>(dst);

if (Data >= vl8_min && Data <= vl8_max) {
*vl8_type = {
.Integer = static_cast<int8_t>(Data),
.Type = vl8_type_header,
};
return sizeof(vl8_enc);
} else if (Data >= vl16_min && Data <= vl16_max) {
*vl16_type = {
.Integer = static_cast<int16_t>(Data),
.Type = vl16_type_header,
};
return sizeof(vl16_enc);
} else if (Data >= vl32_min && Data <= vl32_max) {
*vl32_type = {
.Type = vl32_type_header,
.Integer = static_cast<int32_t>(Data),
};
return sizeof(vl32_enc);
}

*vl64_type = {
.Type = vl64_type_header,
.Integer = Data,
};
return sizeof(vl64_enc);
}

private:

struct vl8_enc {
int8_t Integer : 7;
uint8_t Type : 1;
};
static_assert(sizeof(vl8_enc) == 1);

struct vl16_enc {
int16_t Integer : 14;
uint16_t Type : 2;
};
static_assert(sizeof(vl16_enc) == 2);

struct FEX_PACKED vl32_enc {
uint8_t Type;
int32_t Integer;
};
static_assert(sizeof(vl32_enc) == 5);

struct FEX_PACKED vl64_enc {
uint8_t Type;
int64_t Integer;
};
static_assert(sizeof(vl64_enc) == 9);

// Maximum ranges for encodings.

// vl8 can hold a signed 7-bit integer.
// Encoded in one 8-bit value.
constexpr static int64_t vl8_encoded_bits = 7;
constexpr static int64_t vl8_type_header = 0;
constexpr static int64_t vl8_min = std::numeric_limits<int64_t>::min() >> ((sizeof(int64_t) * 8) - vl8_encoded_bits);
constexpr static int64_t vl8_max = std::numeric_limits<int64_t>::max() >> ((sizeof(int64_t) * 8) - vl8_encoded_bits);

// vl16 can hold a signed 14-bit integer.
// Encoded in one 16-bit value.
constexpr static int64_t vl16_encoded_bits = 14;
constexpr static int64_t vl16_type_header = 0b10;
constexpr static int64_t vl16_min = std::numeric_limits<int64_t>::min() >> ((sizeof(int64_t) * 8) - vl16_encoded_bits);
constexpr static int64_t vl16_max = std::numeric_limits<int64_t>::max() >> ((sizeof(int64_t) * 8) - vl16_encoded_bits);

// vl32 can hold a signed 32-bit integer.
// Encoded in an 8-bit header byte and a 32-bit value.
constexpr static int64_t vl32_encoded_bits = 32;
constexpr static int64_t vl32_type_header = 0b1100'0000;
constexpr static int64_t vl32_min = std::numeric_limits<int32_t>::min();
constexpr static int64_t vl32_max = std::numeric_limits<int32_t>::max();

// vl64 can hold a signed 64-bit integer.
// Encoded in an 8-bit header byte and a 64-bit value.
constexpr static int64_t vl64_encoded_bits = 64;
constexpr static int64_t vl64_type_header = 0b1110'0000;
constexpr static int64_t vl64_min = std::numeric_limits<int64_t>::min();
constexpr static int64_t vl64_max = std::numeric_limits<int64_t>::max();
};

} // namespace FEXCore::Utils
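
A quick sanity check of the size classes, assuming the header is compiled into
a standalone program; the expected sizes of 1, 2, 5, and 9 bytes follow
directly from the ranges documented in the header, and the test values are
chosen only for illustration.

// Standalone sanity check of the vl64 size classes (not part of the commit).
#include "Utils/variable_length_integer.h"

#include <cassert>
#include <cstdint>

int main() {
  using FEXCore::Utils::vl64;

  // One value per size class: 7-bit, 14-bit, 32-bit, and full 64-bit.
  const int64_t Values[] = {42, 5000, 1000000, int64_t(1) << 40};
  const size_t ExpectedSizes[] = {1, 2, 5, 9};

  uint8_t Buffer[16];
  for (size_t i = 0; i < 4; ++i) {
    assert(vl64::EncodedSize(Values[i]) == ExpectedSizes[i]);
    assert(vl64::Encode(Buffer, Values[i]) == ExpectedSizes[i]);
  }

  // 8-bit values are the common case in the RIP entries; round-trip one of them.
  vl64::Encode(Buffer, 42);
  auto Result = vl64::Decode(Buffer);
  assert(Result.Integer == 42 && Result.Size == 1);

  return 0;
}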