FEXCore/JIT: Encode the JITRIPReconstructionEntries using variable length integer

When #2722 implemented this initially and #4271 switched over to signed
int16_t, the assumption was that int16_t was a reasonable trade-off
between encoding size and having to deal with 8-bit values being too
small in some cases.

In the common case we are almost always encoding 8-bit values because
instructions are typically linear (and less than 15 bytes in size), but
16-bit was chosen because an optimizing JIT and runs of instructions
that don't cause exceptions can accumulate offsets larger than 8-bit
can hold.

Instead of hardcoding 16-bit values, implement a variable length
integer class where ~96.8% of values are encoded as 8-bit and the
remaining ~3.19% as 16-bit. Due to constraints that #4271 put in place,
we can basically guarantee for now that branch targets fit within
16-bit. The VL class also supports 32-bit and 64-bit, so if that
behaviour changes, nothing else needs to change.

Some stats when running Sonic Mania with multiblock enabled.
Encoded integers: 3,504,907
Encoded 8-bit:    3,393,095 (96.8%)
Encoded 16-bit:     111,812 (3.19%)
Encoded 32/64-bit:        0

Encoded Size:       3,615,181 bytes (3.44MiB)
Fixed encoded size: 7,007,604 bytes (6.68MiB)

Definitely worth using and saves the headache of large RIP/PC offsets
causing problems.
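
To make the size trade-off concrete, here is a minimal, hypothetical sketch
using the EncodedSize helper from the new variable_length_integer.h header
added below. It assumes the header is on the include path; the example
offsets (4 host bytes, 3 guest bytes since the previous entry) are invented
for illustration, and only the vl64 API itself comes from this commit.

// Hypothetical per-entry size comparison; the offsets are invented for the example.
#include "Utils/variable_length_integer.h"

#include <cstdint>
#include <cstdio>

int main() {
  // A typical entry: 4 bytes of host code and 3 bytes of guest code since the previous entry.
  const int64_t HostPCOffset = 4;
  const int64_t GuestRIPOffset = 3;

  const size_t Variable = FEXCore::Utils::vl64::EncodedSize(HostPCOffset) +
                          FEXCore::Utils::vl64::EncodedSize(GuestRIPOffset);
  const size_t Fixed = sizeof(uint16_t) + sizeof(int16_t); // Old JITRIPReconstructEntries layout.

  // Prints "variable: 2 bytes, fixed: 4 bytes" - the per-entry saving behind
  // the 3.44MiB vs 6.68MiB totals above.
  printf("variable: %zu bytes, fixed: %zu bytes\n", Variable, Fixed);
  return 0;
}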
Sonicadvance1 committed Jan 27, 2025
1 parent 8d6a43d commit 0fa2f9c
Showing 6 changed files with 315 additions and 39 deletions.
6 changes: 2 additions & 4 deletions CodeEmitter/CodeEmitter/BranchOps.inl
@@ -227,8 +227,7 @@ public:
}

void tbz(ARMEmitter::Register rt, uint32_t Bit, ForwardLabel* Label) {
AddLocationToLabel(
Label, ForwardLabel::Reference {.Location = GetCursorAddress<uint8_t*>(), .Type = ForwardLabel::InstType::TEST_BRANCH});
AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress<uint8_t*>(), .Type = ForwardLabel::InstType::TEST_BRANCH});

constexpr uint32_t Op = 0b0011'0110 << 24;

@@ -258,8 +257,7 @@ public:
}

void tbnz(ARMEmitter::Register rt, uint32_t Bit, ForwardLabel* Label) {
AddLocationToLabel(
Label, ForwardLabel::Reference {.Location = GetCursorAddress<uint8_t*>(), .Type = ForwardLabel::InstType::TEST_BRANCH});
AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress<uint8_t*>(), .Type = ForwardLabel::InstType::TEST_BRANCH});
constexpr uint32_t Op = 0b0011'0111 << 24;

TestAndBranch(Op, rt, Bit, 0);
16 changes: 0 additions & 16 deletions FEXCore/Source/Interface/Core/CPUBackend.h
@@ -102,22 +102,6 @@ namespace CPU {
uint32_t _Pad;
};

// Entries that live after the JITCodeTail.
// These entries correlate JIT code regions with guest RIP regions.
// Using these entries FEX is able to reconstruct the guest RIP accurately when an instruction cause a signal fault.
// Packed using 16-bit entries to ensure the size isn't too large.
// These smaller sizes means that each entry is relative to each other instead of absolute offset from the start of the JIT block.
// When reconstructing the RIP, each entry must be walked linearly and accumulated with the previous entries.
// This is a trade-off between compression inside the JIT code space and execution time when reconstruction the RIP.
// RIP reconstruction when faulting is less likely so we are requiring the accumulation.
struct JITRIPReconstructEntries {
// The Host PC offset from the previous entry.
uint16_t HostPCOffset;

// How much to offset the RIP from the previous entry.
int16_t GuestRIPOffset;
};

/**
* @brief Tells this CPUBackend to compile code for the provided IR and DebugData
*
17 changes: 10 additions & 7 deletions FEXCore/Source/Interface/Core/Core.cpp
@@ -28,6 +28,7 @@ desc: Glues Frontend, OpDispatcher and IR Opts & Compilation, LookupCache, Dispa
#include "Utils/Allocator.h"
#include "Utils/Allocator/HostAllocator.h"
#include "Utils/SpinWaitLock.h"
#include "Utils/variable_length_integer.h"

#include <FEXCore/Config/Config.h>
#include <FEXCore/Core/Context.h>
@@ -144,24 +145,26 @@ uint64_t ContextImpl::RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* T
auto [InlineHeader, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame);

if (InlineHeader) {
auto RIPEntries = reinterpret_cast<const CPU::CPUBackend::JITRIPReconstructEntries*>(
Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail + InlineTail->OffsetToRIPEntries);

// Check if the host PC is currently within a code block.
// If it is then RIP can be reconstructed from the beginning of the code block.
// This is currently as close as FEX can get RIP reconstructions.
if (HostPC >= reinterpret_cast<uint64_t>(BlockBegin) && HostPC < reinterpret_cast<uint64_t>(BlockBegin + InlineTail->Size)) {

auto RIPEntry =
reinterpret_cast<const uint8_t*>(Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail + InlineTail->OffsetToRIPEntries);

// Reconstruct RIP from JIT entries for this block.
uint64_t StartingHostPC = BlockBegin;
uint64_t StartingGuestRIP = InlineTail->RIP;

for (uint32_t i = 0; i < InlineTail->NumberOfRIPEntries; ++i) {
const auto& RIPEntry = RIPEntries[i];
if (HostPC >= (StartingHostPC + RIPEntry.HostPCOffset)) {
auto HostPCOffset = FEXCore::Utils::vl64::Decode(RIPEntry);
RIPEntry += HostPCOffset.Size;
auto GuestRIPOffset = FEXCore::Utils::vl64::Decode(RIPEntry);
if (HostPC >= (StartingHostPC + HostPCOffset.Integer)) {
// We are beyond this entry, keep going forward.
StartingHostPC += RIPEntry.HostPCOffset;
StartingGuestRIP += RIPEntry.GuestRIPOffset;
StartingHostPC += HostPCOffset.Integer;
StartingGuestRIP += GuestRIPOffset.Integer;
} else {
// Passed where the Host PC is at. Break now.
break;
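
As a self-contained sketch of the walk above (not the exact FEX code): encode
two hand-made entries, then accumulate them until the faulting host PC is
passed. The block addresses, entry values, and entry count here are all
invented for the example; only vl64::Encode/Decode come from this commit.

// Standalone sketch of the RIP reconstruction walk; entries and addresses are invented.
#include "Utils/variable_length_integer.h"

#include <cstdint>
#include <cstdio>

int main() {
  using FEXCore::Utils::vl64;

  // Two fake entries, each a (HostPCOffset, GuestRIPOffset) pair: (4, 3) then (8, 5).
  uint8_t Entries[32];
  uint8_t* Write = Entries;
  const int64_t Offsets[] = {4, 3, 8, 5};
  for (int64_t Offset : Offsets) {
    Write += vl64::Encode(Write, Offset);
  }

  const uint64_t BlockBegin = 0x1000;     // Host address of the JIT block.
  const uint64_t BlockRIP = 0x40000;      // Guest RIP the block was compiled from.
  const uint64_t HostPC = BlockBegin + 6; // Fault somewhere inside the second host region.

  uint64_t StartingHostPC = BlockBegin;
  uint64_t StartingGuestRIP = BlockRIP;
  const uint8_t* RIPEntry = Entries;
  for (uint32_t i = 0; i < 2; ++i) {
    auto HostPCOffset = vl64::Decode(RIPEntry);
    RIPEntry += HostPCOffset.Size;
    auto GuestRIPOffset = vl64::Decode(RIPEntry);
    RIPEntry += GuestRIPOffset.Size;

    if (HostPC >= (StartingHostPC + HostPCOffset.Integer)) {
      // We are beyond this entry, keep going forward.
      StartingHostPC += HostPCOffset.Integer;
      StartingGuestRIP += GuestRIPOffset.Integer;
    } else {
      // Passed where the Host PC is at.
      break;
    }
  }

  // Prints 0x40003 for this fault address.
  printf("Reconstructed RIP: 0x%llx\n", static_cast<unsigned long long>(StartingGuestRIP));
  return 0;
}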
45 changes: 33 additions & 12 deletions FEXCore/Source/Interface/Core/JIT/JIT.cpp
@@ -21,6 +21,7 @@ desc: Main glue logic of the arm64 splatter backend
#include "Interface/IR/Passes/RegisterAllocationPass.h"

#include "Utils/MemberFunctionToPointer.h"
#include "Utils/variable_length_integer.h"

#include <FEXCore/Core/X86Enums.h>
#include <FEXCore/Debug/InternalThreadState.h>
@@ -853,10 +854,23 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size
auto JITBlockTail = GetCursorAddress<JITCodeTail*>();
CursorIncrement(sizeof(JITCodeTail));

auto JITRIPEntriesLocation = GetCursorAddress<uint8_t*>();
auto JITRIPEntries = GetCursorAddress<JITRIPReconstructEntries*>();
// Entries that live after the JITCodeTail.
// These entries correlate JIT code regions with guest RIP regions.
// Using these entries FEX is able to reconstruct the guest RIP accurately when an instruction causes a signal fault.
// Packed using two variable length integers per entry to ensure the size isn't too large.
// These smaller sizes mean that each entry is relative to the previous one instead of an absolute offset from the start of the JIT block.
// When reconstructing the RIP, each entry must be walked linearly and accumulated with the previous entries.
// This is a trade-off between compression inside the JIT code space and execution time when reconstructing the RIP.
// RIP reconstruction when faulting is the less likely path, so we accept the cost of the accumulation.
//
// struct {
// // The Host PC offset from the previous entry.
// FEXCore::Utils::vl64 HostPCOffset;
// // How much to offset the RIP from the previous entry.
// FEXCore::Utils::vl64 GuestRIPOffset;
// };

CursorIncrement(sizeof(JITRIPReconstructEntries) * DebugData->GuestOpcodes.size());
auto JITRIPEntriesBegin = GetCursorAddress<uint8_t*>();

// Put the block's RIP entry in the tail.
// This will be used for RIP reconstruction in the future.
@@ -867,27 +881,34 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size
JITBlockTail->SingleInst = SingleInst;
JITBlockTail->SpinLockFutex = 0;

auto JITRIPEntriesLocation = JITRIPEntriesBegin;

{
// Store the RIP entries.
JITBlockTail->NumberOfRIPEntries = DebugData->GuestOpcodes.size();
JITBlockTail->OffsetToRIPEntries = JITRIPEntriesLocation - JITBlockTailLocation;
JITBlockTail->OffsetToRIPEntries = JITRIPEntriesBegin - JITBlockTailLocation;
uintptr_t CurrentRIPOffset = 0;
uint64_t CurrentPCOffset = 0;

for (size_t i = 0; i < DebugData->GuestOpcodes.size(); i++) {
const auto& GuestOpcode = DebugData->GuestOpcodes[i];
auto& RIPEntry = JITRIPEntries[i];
[[maybe_unused]] uint64_t HostPCOffset = GuestOpcode.HostEntryOffset - CurrentPCOffset;
[[maybe_unused]] int64_t GuestRIPOffset = GuestOpcode.GuestEntryOffset - CurrentRIPOffset;
LOGMAN_THROW_A_FMT(HostPCOffset <= std::numeric_limits<uint16_t>::max(), "PC offset too large");
LOGMAN_THROW_A_FMT(GuestRIPOffset >= std::numeric_limits<int16_t>::min(), "RIP offset too small");
LOGMAN_THROW_A_FMT(GuestRIPOffset <= std::numeric_limits<int16_t>::max(), "RIP offset too large");
RIPEntry.HostPCOffset = GuestOpcode.HostEntryOffset - CurrentPCOffset;
RIPEntry.GuestRIPOffset = GuestOpcode.GuestEntryOffset - CurrentRIPOffset;
int64_t HostPCOffset = GuestOpcode.HostEntryOffset - CurrentPCOffset;
int64_t GuestRIPOffset = GuestOpcode.GuestEntryOffset - CurrentRIPOffset;

size_t Size = FEXCore::Utils::vl64::Encode(JITRIPEntriesLocation, HostPCOffset);
JITRIPEntriesLocation += Size;

Size = FEXCore::Utils::vl64::Encode(JITRIPEntriesLocation, GuestRIPOffset);
JITRIPEntriesLocation += Size;

CurrentPCOffset = GuestOpcode.HostEntryOffset;
CurrentRIPOffset = GuestOpcode.GuestEntryOffset;
}
}

CursorIncrement(JITRIPEntriesLocation - JITRIPEntriesBegin);
Align();

CodeHeader->OffsetToBlockTail = JITBlockTailLocation - CodeData.BlockBegin;

CodeData.Size = GetCursorAddress<uint8_t*>() - CodeData.BlockBegin;
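
The encoding side, condensed into a free-standing helper for clarity: the
GuestOpcodeInfo struct and call shape below are stand-ins invented for the
example, but the delta/encode/advance pattern mirrors the loop above, and the
returned byte count plays the role of the final CursorIncrement.

// Sketch of the encoding pass; GuestOpcodeInfo and the helper signature are invented.
#include "Utils/variable_length_integer.h"

#include <cstdint>
#include <vector>

struct GuestOpcodeInfo {
  uint64_t HostEntryOffset;  // Offset of this opcode's host code from the block start.
  uint64_t GuestEntryOffset; // Offset of this opcode's guest RIP from the block RIP.
};

// Writes both offsets of each entry back to back and returns how many bytes were produced,
// so the caller can advance its cursor by the actual encoded size.
size_t EncodeRIPEntries(uint8_t* Dst, const std::vector<GuestOpcodeInfo>& Opcodes) {
  uint8_t* Cursor = Dst;
  uint64_t CurrentPCOffset = 0;
  uint64_t CurrentRIPOffset = 0;

  for (const auto& Op : Opcodes) {
    // Deltas against the previous entry keep the values small enough for the
    // 8-bit encoding in the common case.
    const int64_t HostPCOffset = Op.HostEntryOffset - CurrentPCOffset;
    const int64_t GuestRIPOffset = Op.GuestEntryOffset - CurrentRIPOffset;

    Cursor += FEXCore::Utils::vl64::Encode(Cursor, HostPCOffset);
    Cursor += FEXCore::Utils::vl64::Encode(Cursor, GuestRIPOffset);

    CurrentPCOffset = Op.HostEntryOffset;
    CurrentRIPOffset = Op.GuestEntryOffset;
  }

  return Cursor - Dst;
}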
156 changes: 156 additions & 0 deletions FEXCore/Source/Utils/variable_length_integer.h
@@ -0,0 +1,156 @@
// SPDX-License-Identifier: MIT
#pragma once

#include <FEXCore/Utils/CompilerDefs.h>

#include <cstdio>
#include <cstdint>
#include <cstddef>
#include <limits>

namespace FEXCore::Utils {
// Variable length signed integer
// The most common encoding is an 8-bit positive value, but larger sizes can occur
//
// 8-bit:
// bit[7] = 0 - 8-bit
// bit[6:0] = 7-bit encoding
//
// 16-bit:
// byte1[7:6] = 0b10 - 16-bit
// byte1[5:0] = top 6-bits
// byte2[7:0] = Bottom 8 bits
//
// 32-bit
// byte1[7:5] = 0b110 - 32-bit
// byte1[4:0] = <reserved>
// word[31:0] = signed word
//
// 64-bit
// byte1[7:5] = 0b111 - 64-bit
// byte1[4:0] = <reserved>
// dword[63:0] = signed dword
struct vl64 final {
static size_t EncodedSize(int64_t Data) {
if (Data >= vl8_min && Data <= vl8_max) {
return sizeof(vl8_enc);
} else if (Data >= vl16_min && Data <= vl16_max) {
return sizeof(vl16_enc);
} else if (Data >= vl32_min && Data <= vl32_max) {
return sizeof(vl32_enc);
}
return sizeof(vl64_enc);
}

struct Decoded {
int64_t Integer;
size_t Size;
};

static Decoded Decode(const uint8_t* data) {
auto vl8_type = reinterpret_cast<const vl8_enc*>(data);
auto vl16_type = reinterpret_cast<const vl16_enc*>(data);
auto vl32_type = reinterpret_cast<const vl32_enc*>(data);
auto vl64_type = reinterpret_cast<const vl64_enc*>(data);

if (vl8_type->Type == vl8_type_header) {
return {vl8_type->Integer, sizeof(vl8_enc)};
} else if (vl16_type->Type == vl16_type_header) {
return {vl16_type->Integer, sizeof(vl16_enc)};
} else if (vl32_type->Type == vl32_type_header) {
return {vl32_type->Integer, sizeof(vl32_enc)};
}
return {vl64_type->Integer, sizeof(vl64_enc)};
}

static size_t Encode(uint8_t* dst, int64_t Data) {
auto vl8_type = reinterpret_cast<vl8_enc*>(dst);
auto vl16_type = reinterpret_cast<vl16_enc*>(dst);
auto vl32_type = reinterpret_cast<vl32_enc*>(dst);
auto vl64_type = reinterpret_cast<vl64_enc*>(dst);

if (Data >= vl8_min && Data <= vl8_max) {
*vl8_type = {
.Integer = static_cast<int8_t>(Data),
.Type = vl8_type_header,
};
return sizeof(vl8_enc);
} else if (Data >= vl16_min && Data <= vl16_max) {
*vl16_type = {
.Integer = static_cast<int16_t>(Data),
.Type = vl16_type_header,
};
return sizeof(vl16_enc);
} else if (Data >= vl32_min && Data <= vl32_max) {
*vl32_type = {
.Type = vl32_type_header,
.Integer = static_cast<int32_t>(Data),
};
return sizeof(vl32_enc);
}

*vl64_type = {
.Type = vl64_type_header,
.Integer = Data,
};
return sizeof(vl64_enc);
}

private:

struct vl8_enc {
int8_t Integer : 7;
uint8_t Type : 1;
};
static_assert(sizeof(vl8_enc) == 1);

struct vl16_enc {
int16_t Integer : 14;
uint16_t Type : 2;
};
static_assert(sizeof(vl16_enc) == 2);

struct FEX_PACKED vl32_enc {
uint8_t Type;
int32_t Integer;
};
static_assert(sizeof(vl32_enc) == 5);

struct FEX_PACKED vl64_enc {
uint8_t Type;
int64_t Integer;
};
static_assert(sizeof(vl64_enc) == 9);

// Maximum ranges for encodings.

// vl8 can hold a signed 7-bit integer.
// Encoded in one 8-bit value.
constexpr static int64_t vl8_encoded_bits = 7;
constexpr static int64_t vl8_type_header = 0;
constexpr static int64_t vl8_min = std::numeric_limits<int64_t>::min() >> ((sizeof(int64_t) * 8) - vl8_encoded_bits);
constexpr static int64_t vl8_max = std::numeric_limits<int64_t>::max() >> ((sizeof(int64_t) * 8) - vl8_encoded_bits);

// vl16 can hold a signed 14-bit integer.
// Encoded in one 16-bit value.
constexpr static int64_t vl16_encoded_bits = 14;
constexpr static int64_t vl16_type_header = 0b10;
constexpr static int64_t vl16_min = std::numeric_limits<int64_t>::min() >> ((sizeof(int64_t) * 8) - vl16_encoded_bits);
constexpr static int64_t vl16_max = std::numeric_limits<int64_t>::max() >> ((sizeof(int64_t) * 8) - vl16_encoded_bits);

// vl32 can hold a signed 32-bit integer.
// Encoded in an 8-bit header byte and a 32-bit value.
constexpr static int64_t vl32_encoded_bits = 32;
constexpr static int64_t vl32_type_header = 0b1100'0000;
constexpr static int64_t vl32_min = std::numeric_limits<int32_t>::min();
constexpr static int64_t vl32_max = std::numeric_limits<int32_t>::max();

// vl64 can hold a signed 64-bit integer.
// Encoded in an 8-bit header byte and a 64-bit value.
constexpr static int64_t vl64_encoded_bits = 64;
constexpr static int64_t vl64_type_header = 0b1110'0000;
constexpr static int64_t vl64_min = std::numeric_limits<int64_t>::min();
constexpr static int64_t vl64_max = std::numeric_limits<int64_t>::max();
};

} // namespace FEXCore::Utils
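
A quick sanity check of the size classes, assuming the header is compiled into
a standalone program; the expected sizes of 1, 2, 5, and 9 bytes follow
directly from the ranges documented in the header, and the test values are
chosen only for illustration.

// Standalone sanity check of the vl64 size classes (not part of the commit).
#include "Utils/variable_length_integer.h"

#include <cassert>
#include <cstdint>

int main() {
  using FEXCore::Utils::vl64;

  // One value per size class: 7-bit, 14-bit, 32-bit, and full 64-bit.
  const int64_t Values[] = {42, 5000, 1000000, int64_t(1) << 40};
  const size_t ExpectedSizes[] = {1, 2, 5, 9};

  uint8_t Buffer[16];
  for (size_t i = 0; i < 4; ++i) {
    assert(vl64::EncodedSize(Values[i]) == ExpectedSizes[i]);
    assert(vl64::Encode(Buffer, Values[i]) == ExpectedSizes[i]);
  }

  // 8-bit values are the common case in the RIP entries; round-trip one of them.
  vl64::Encode(Buffer, 42);
  auto Result = vl64::Decode(Buffer);
  assert(Result.Integer == 42 && Result.Size == 1);

  return 0;
}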