FEX-Emu · Sonicadvance1 · Jan 22, 2025 · Jan 22, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/FEXCore/Source/Interface/Config/Config.json.in b/FEXCore/Source/Interface/Config/Config.json.in
@@ -363,6 +363,14 @@
           "Redirects the telemetry folder that FEX usually writes to.",
           "By default telemetry data is stored in {$FEX_APP_DATA_LOCATION,{$XDG_DATA_HOME,$HOME}/.fex-emu/Telemetry/}"
         ]
+      },
+      "ProfileStats": {
+        "Type": "bool",
+        "Default": "false",
+        "Desc": [
+          "Enables FEX's low-overhead sampling profile statistics.",
+          "Requires a supported version of Mangohud to see the results"
+        ]
       }
     },
     "Hacks": {

diff --git a/FEXCore/Source/Interface/Core/Core.cpp b/FEXCore/Source/Interface/Core/Core.cpp
@@ -831,8 +831,9 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT
 }
 
 uintptr_t ContextImpl::CompileBlock(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP, uint64_t MaxInst) {
-  FEXCORE_PROFILE_SCOPED("CompileBlock");
   auto Thread = Frame->Thread;
+  FEXCORE_PROFILE_SCOPED("CompileBlock");
+  FEXCORE_PROFILE_ACCUMULATION(Thread, AccumulatedJITTime);
 
   // Invalidate might take a unique lock on this, to guarantee that during invalidation no code gets compiled
   auto lk = GuardSignalDeferringSection<std::shared_lock>(CodeInvalidationMutex, Thread);

diff --git a/FEXCore/include/FEXCore/Debug/InternalThreadState.h b/FEXCore/include/FEXCore/Debug/InternalThreadState.h
@@ -36,6 +36,10 @@ class OpDispatchBuilder;
 class PassManager;
 } // namespace FEXCore::IR
 
+namespace FEXCore::Profiler {
+struct ThreadStats; // Forward declaration; InternalThreadState only stores a pointer to it.
+} // namespace FEXCore::Profiler
+
 namespace FEXCore::Core {
 
 // Special-purpose replacement for std::unique_ptr to allow InternalThreadState to be standard layout.
@@ -95,6 +99,9 @@ struct InternalThreadState : public FEXCore::Allocator::FEXAllocOperators {
 
   std::shared_mutex ObjectCacheRefCounter {};
 
+  // This pointer is owned by the frontend.
+  FEXCore::Profiler::ThreadStats* ThreadStats {};
+
   ///< Data pointer for exclusive use by the frontend
   void* FrontendPtr;
 

diff --git a/FEXCore/include/FEXCore/Utils/Profiler.h b/FEXCore/include/FEXCore/Utils/Profiler.h
@@ -1,13 +1,73 @@
 // SPDX-License-Identifier: MIT
 #pragma once
+#include <atomic>
 #include <cstdint>
 #include <string_view>
 
+#ifdef _M_X86_64
+#include <x86intrin.h>
+#endif
+
 #include <FEXCore/Utils/CompilerDefs.h>
 
 namespace FEXCore::Profiler {
+// FEXCore live-stats: layout definitions for the per-thread statistics region.
+constexpr uint8_t STATS_VERSION = 1; // Value the frontend's SaveHeader() writes into ThreadStatsHeader::Version.
+enum class AppType : uint8_t { // Stored in ThreadStatsHeader::app_type. Enumerators have implicit values (0..3) — presumably read by an external consumer, confirm before reordering.
+  LINUX_32,
+  LINUX_64,
+  WIN_ARM64EC,
+  WIN_WOW64,
+};
+
+struct ThreadStatsHeader { // Header at the very start of the stats region (StatAllocBase::SaveHeader casts the region base to this type).
+  uint8_t Version; // Set to STATS_VERSION.
+  AppType app_type; // Application kind, as passed to SaveHeader().
+  uint8_t _pad[2];
+  char fex_version[48]; // Filled from GIT_DESCRIBE_STRING, truncated to fit.
+  std::atomic<uint32_t> Head; // Byte offset (from the region base) of the first ThreadStats in the list; 0 means the list is empty.
+  std::atomic<uint32_t> Size; // Current size of the region in bytes; rewritten whenever more slots are allocated.
+  uint32_t Pad;
+};
+
+struct ThreadStats { // One per-thread statistics slot; slots are chained into a singly-linked list through Next.
+  std::atomic<uint32_t> Next; // Byte offset (from the region base) of the next slot in the list; zeroed when the slot is allocated.
+  std::atomic<uint32_t> TID; // Non-zero while the slot is allocated; 0 marks the slot as free.
+
+  // Accumulated time (In unscaled CPU cycles! i.e. raw GetCycleCounter() deltas)
+  uint64_t AccumulatedJITTime; // Accumulated around ContextImpl::CompileBlock via FEXCORE_PROFILE_ACCUMULATION.
+  uint64_t AccumulatedSignalTime;
+
+  // Accumulated event counts
+  uint64_t AccumulatedSIGBUSCount;
+  uint64_t AccumulatedSMCCount;
+};
+
 #ifdef ENABLE_FEXCORE_PROFILER
 
+#ifdef _M_ARM_64
+/**
+ * @brief Get the raw cycle counter (the CNTVCT_EL0 system register) with a synchronizing isb issued before the read.
+ * @return The counter value exactly as read; no scaling or conversion to time units is applied.
+ * `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
+ */
+static inline uint64_t GetCycleCounter() {
+  uint64_t Result {};
+  __asm volatile(R"(
+      isb;
+      mrs %[Res], CNTVCT_EL0;
+    )"
+                 : [Res] "=r"(Result));
+  return Result;
+}
+#else
+static inline uint64_t GetCycleCounter() {
+  // __rdtscp also writes an auxiliary value through its pointer argument; only the timestamp is wanted here.
+  unsigned Aux;
+  return __rdtscp(&Aux);
+}
+#endif
+
 FEX_DEFAULT_VISIBILITY void Init();
 FEX_DEFAULT_VISIBILITY void Shutdown();
 FEX_DEFAULT_VISIBILITY void TraceObject(std::string_view const Format);
@@ -34,6 +94,36 @@ class ProfilerBlock final {
 // Declare a scoped profile block variable with a fixed name.
 #define FEXCORE_PROFILE_SCOPED(name) FEXCore::Profiler::ProfilerBlock UniqueScopeName(ScopedBlock_, __LINE__)(name)
 
+/**
+ * @brief Scoped timer that adds the GetCycleCounter() ticks elapsed during its lifetime to a counter.
+ *
+ * The counter is updated once, in the destructor, through a relaxed std::atomic_ref fetch_add.
+ *
+ * @tparam T Type of the counter being accumulated into.
+ * @tparam FlatOffset Constant added to every measured duration.
+ */
+template<typename T, size_t FlatOffset = 0>
+class AccumulationBlock final {
+public:
+  /**
+   * @param Stat Counter to accumulate into. May be nullptr, in which case nothing is measured or written.
+   */
+  explicit AccumulationBlock(T* Stat)
+    : Begin {Stat ? GetCycleCounter() : 0}
+    , Stat {Stat} {}
+
+  // Not copyable: a duplicate would add the same interval to the counter a second time.
+  AccumulationBlock(const AccumulationBlock&) = delete;
+  AccumulationBlock& operator=(const AccumulationBlock&) = delete;
+
+  ~AccumulationBlock() {
+    if (!Stat) {
+      // Stats are not being collected for this thread; skip reading the counter entirely.
+      return;
+    }
+
+    const auto Duration = GetCycleCounter() - Begin + FlatOffset;
+    std::atomic_ref<T>(*Stat).fetch_add(Duration, std::memory_order_relaxed);
+  }
+
+private:
+  uint64_t Begin;
+  T* Stat;
+};
+
+// Declares a scoped AccumulationBlock that adds the time spent in the rest of the enclosing scope to
+// ThreadState->ThreadStats->Stat. Nothing is written when ThreadState->ThreadStats is null.
+// Macro arguments are parenthesized so that arbitrary expressions can be passed safely.
+#define FEXCORE_PROFILE_ACCUMULATION(ThreadState, Stat)                                                                            \
+  FEXCore::Profiler::AccumulationBlock<decltype((ThreadState)->ThreadStats->Stat)> UniqueScopeName(ScopedAccumulation_, __LINE__)( \
+    (ThreadState)->ThreadStats ? &(ThreadState)->ThreadStats->Stat : nullptr);
+// Adds `value` to ThreadState->ThreadStats->Stat with a plain (non-atomic) `+=`, if ThreadStats is non-null.
+#define FEXCORE_PROFILE_INSTANT_INCREMENT(ThreadState, Stat, value) \
+  do {                                                              \
+    if ((ThreadState)->ThreadStats) {                               \
+      (ThreadState)->ThreadStats->Stat += (value);                  \
+    }                                                               \
+  } while (0)
+
 #else
 [[maybe_unused]]
 static void Init() {}
@@ -50,5 +140,12 @@ static void TraceObject(std::string_view const, uint64_t) {}
 #define FEXCORE_PROFILE_SCOPED(...) \
   do {                              \
   } while (0)
+#define FEXCORE_PROFILE_ACCUMULATION(...) \
+  do {                                    \
+  } while (0) // ENABLE_FEXCORE_PROFILER not defined: expands to a no-op statement.
+#define FEXCORE_PROFILE_INSTANT_INCREMENT(...) \
+  do {                                         \
+  } while (0) // ENABLE_FEXCORE_PROFILER not defined: expands to a no-op statement.
+
 #endif
 } // namespace FEXCore::Profiler
diff --git a/Source/Common/CMakeLists.txt b/Source/Common/CMakeLists.txt
@@ -7,7 +7,8 @@ set(SRCS
   EnvironmentLoader.cpp
   HostFeatures.cpp
   JSONPool.cpp
-  StringUtil.cpp)
+  StringUtil.cpp
+  Profiler.cpp)
 
 if (NOT MINGW_BUILD)
   list (APPEND SRCS

diff --git a/Source/Common/Profiler.cpp b/Source/Common/Profiler.cpp
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: MIT
+#include "Common/Profiler.h"
+#include "git_version.h"
+
+#include <FEXCore/Debug/InternalThreadState.h>
+
+namespace FEX::Profiler {
+/**
+ * @brief Writes the stats header at the start of the region and initializes slot bookkeeping.
+ *
+ * Does nothing when Base is null.
+ *
+ * @param AppType Application kind recorded in the header.
+ */
+void StatAllocBase::SaveHeader(FEXCore::Profiler::AppType AppType) {
+  if (!Base) {
+    return;
+  }
+
+  Head = reinterpret_cast<FEXCore::Profiler::ThreadStatsHeader*>(Base);
+  Head->Size.store(CurrentSize, std::memory_order_relaxed);
+  Head->Version = FEXCore::Profiler::STATS_VERSION;
+
+  // strncpy() does not write a terminator when the source is at least as long as the count, so copy at most
+  // capacity - 1 characters and terminate explicitly instead of relying on the memory already being zero.
+  std::string_view GitString = GIT_DESCRIBE_STRING;
+  constexpr size_t VersionCapacity = sizeof(FEXCore::Profiler::ThreadStatsHeader::fex_version);
+  const size_t CopyLength = std::min(GitString.size(), VersionCapacity - 1);
+  strncpy(Head->fex_version, GitString.data(), CopyLength);
+  Head->fex_version[CopyLength] = '\0';
+  Head->app_type = AppType;
+
+  // The slot array starts immediately after the header.
+  Stats = reinterpret_cast<FEXCore::Profiler::ThreadStats*>(reinterpret_cast<uint64_t>(Base) + sizeof(FEXCore::Profiler::ThreadStatsHeader));
+
+  RemainingSlots = TotalSlotsFromSize();
+}
+
+// Asks the frontend for a region twice the current size and accounts for the slots gained.
+// Returns false when the frontend reports the size unchanged, true otherwise.
+bool StatAllocBase::AllocateMoreSlots() {
+  const auto SlotsBeforeGrowth = TotalSlotsFromSize();
+  const uint32_t GrownSize = FrontendAllocateSlots(CurrentSize * 2);
+
+  const bool DidGrow = GrownSize != CurrentSize;
+  if (DidGrow) {
+    // Publish the new size in the header, then count only the newly gained slots as available.
+    CurrentSize = GrownSize;
+    Head->Size.store(CurrentSize, std::memory_order_relaxed);
+    RemainingSlots = TotalSlotsFromSize() - SlotsBeforeGrowth;
+  }
+
+  return DidGrow;
+}
+
+/**
+ * @brief Claims a free stats slot for a thread and appends it to the linked list.
+ *
+ * @param TID Thread id stored in the slot. A non-zero TID is what marks the slot as allocated.
+ * @return The claimed slot, or nullptr when no slot is free and no more could be allocated.
+ */
+FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateSlot(uint32_t TID) {
+  if (!RemainingSlots) {
+    if (!AllocateMoreSlots()) {
+      return nullptr;
+    }
+  }
+
+  // Find a free slot
+  store_memory_barrier();
+  FEXCore::Profiler::ThreadStats* AllocatedSlot {};
+  const size_t TotalSlots = TotalSlotsFromSize();
+  for (size_t i = 0; i < TotalSlots; ++i) {
+    if (Stats[i].TID.load(std::memory_order_relaxed) == 0) {
+      AllocatedSlot = &Stats[i];
+      break;
+    }
+  }
+
+  if (!AllocatedSlot) {
+    // The bookkeeping claimed a free slot but the scan found none; fail rather than clobber a live slot.
+    return nullptr;
+  }
+
+  --RemainingSlots;
+
+  // Slot might be reused, just zero it now.
+  // Clear exactly one slot: the header type is larger than a slot, so sizing by it would also wipe the start
+  // (Next and TID included) of the slot that follows.
+  memset(AllocatedSlot, 0, sizeof(FEXCore::Profiler::ThreadStats));
+
+  // TID != 0 means slot is allocated.
+  AllocatedSlot->TID.store(TID, std::memory_order_relaxed);
+
+  // Setup singly-linked list
+  if (Head->Head.load(std::memory_order_relaxed) == 0) {
+    Head->Head.store(OffsetFromStat(AllocatedSlot), std::memory_order_relaxed);
+  } else {
+    StatTail->Next.store(OffsetFromStat(AllocatedSlot), std::memory_order_relaxed);
+  }
+
+  // Update the tail.
+  StatTail = AllocatedSlot;
+  return AllocatedSlot;
+}
+
+/**
+ * @brief Releases a slot obtained from AllocateSlot() and unlinks it from the linked list.
+ *
+ * @param AllocatedSlot Slot to release. nullptr is ignored.
+ */
+void StatAllocBase::DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) {
+  if (!AllocatedSlot) {
+    return;
+  }
+
+  // TID == 0 will signal the reader to ignore this slot & deallocate it!
+  AllocatedSlot->TID.store(0, std::memory_order_relaxed);
+
+  store_memory_barrier();
+
+  const auto SlotOffset = OffsetFromStat(AllocatedSlot);
+  const auto AllocatedSlotNext = AllocatedSlot->Next.load(std::memory_order_relaxed);
+
+  const bool IsTail = AllocatedSlot == StatTail;
+
+  // Update the linked list.
+  if (Head->Head == SlotOffset) {
+    Head->Head.store(AllocatedSlotNext, std::memory_order_relaxed);
+    if (IsTail) {
+      StatTail = nullptr;
+    }
+  } else {
+    // Search the slot array for the live predecessor, i.e. the slot whose Next names the slot being removed.
+    const size_t TotalSlots = TotalSlotsFromSize();
+    for (size_t i = 0; i < TotalSlots; ++i) {
+      auto Slot = &Stats[i];
+
+      // Freed slots keep their old Next value, which can still name this slot. Matching one of those would leave
+      // the real predecessor pointing at the freed slot and could make a freed slot the tail, so skip them.
+      if (Slot->TID.load(std::memory_order_relaxed) == 0) {
+        continue;
+      }
+
+      auto NextSlotOffset = Slot->Next.load(std::memory_order_relaxed);
+
+      if (NextSlotOffset == SlotOffset) {
+        Slot->Next.store(AllocatedSlotNext, std::memory_order_relaxed);
+
+        if (IsTail) {
+          // This slot is now the tail.
+          StatTail = Slot;
+        }
+        break;
+      }
+    }
+  }
+
+  ++RemainingSlots;
+}
+
+} // namespace FEX::Profiler
diff --git a/Source/Common/Profiler.h b/Source/Common/Profiler.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+/*
+$info$
+tags: Common|Profiler
+desc: Frontend profiler common code
+$end_info$
+*/
+#pragma once
+#include <FEXCore/Utils/Profiler.h>
+
+namespace FEXCore::Core {
+struct InternalThreadState;
+}
+
+#ifdef _M_ARM_64
+static inline void store_memory_barrier() { // Issues `dmb ishst`; the "memory" clobber also stops the compiler moving memory accesses across it.
+  asm volatile("dmb ishst;" ::: "memory");
+}
+
+#else
+static inline void store_memory_barrier() {
+  // Intentionally empty.
+  // x86 is strongly memory ordered with regular loadstores. No need for barrier. NOTE(review): unlike the ARM variant this has no "memory" clobber, so it does not constrain compiler reordering — confirm that is intended.
+}
+#endif
+
+namespace FEX::Profiler {
+/**
+ * @brief Common bookkeeping for the per-thread stats region: a ThreadStatsHeader followed by an array of
+ * ThreadStats slots that are linked together by byte offsets from Base.
+ *
+ * The frontend supplies the backing memory by implementing FrontendAllocateSlots() and setting Base/CurrentSize.
+ * Every member is value-initialized so that SaveHeader() and AllocateSlot() never read indeterminate state when
+ * the region has not been set up.
+ */
+class StatAllocBase {
+protected:
+  FEXCore::Profiler::ThreadStats* AllocateSlot(uint32_t TID);
+  void DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot);
+
+  // Byte offset of a slot relative to the start of the region.
+  uint32_t OffsetFromStat(FEXCore::Profiler::ThreadStats* Stat) const {
+    return reinterpret_cast<uint64_t>(Stat) - reinterpret_cast<uint64_t>(Base);
+  }
+  // Number of usable slots for the current region size.
+  uint32_t TotalSlotsFromSize() const {
+    return TotalSlotsFromSize(CurrentSize);
+  }
+  // Number of usable slots for a region of `Size` bytes; 0 when the region cannot hold the header plus one slot.
+  // NOTE(review): the trailing `- 1` leaves one slot's worth of space unused; the reason is not visible here.
+  uint32_t TotalSlotsFromSize(uint32_t Size) const {
+    constexpr size_t HeaderSize = sizeof(FEXCore::Profiler::ThreadStatsHeader);
+    constexpr size_t SlotSize = sizeof(FEXCore::Profiler::ThreadStats);
+    if (Size < HeaderSize + SlotSize) {
+      // Guard against unsigned wrap-around for empty or undersized regions.
+      return 0;
+    }
+    return (Size - HeaderSize) / SlotSize - 1;
+  }
+
+  // Index into the slot array for a byte offset relative to the start of the region.
+  uint32_t SlotIndexFromOffset(uint32_t Offset) const {
+    return (Offset - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats);
+  }
+
+  void SaveHeader(FEXCore::Profiler::AppType AppType);
+
+  void* Base {};                                 // Start of the region; SaveHeader() is a no-op while this is null.
+  uint32_t CurrentSize {};                       // Size of the region in bytes.
+  FEXCore::Profiler::ThreadStatsHeader* Head {}; // Header at Base, set by SaveHeader().
+  FEXCore::Profiler::ThreadStats* Stats {};      // First slot, directly after the header.
+  FEXCore::Profiler::ThreadStats* StatTail {};   // Last slot in the linked list, or null when the list is empty.
+  uint32_t RemainingSlots {};                    // Slots still available before more must be allocated.
+
+  // Limited to 4MB which should be a few hundred threads of tracking capability.
+  // I (Sonicadvance1) wanted to reserve 128MB of VA space because it's cheap, but ran in to a bug when running WINE.
+  // WINE allocates [0x7fff'fe00'0000, 0x7fff'ffff'0000) which /consistently/ overlaps with FEX's sigaltstack.
+  // This only occurs when this stat allocation size is large as the top-down allocation pushes the alt-stack further.
+  // Additionally, only occurs on 48-bit VA systems, as mmap on lesser VA will fail regardless.
+  // TODO: Bump allocation size up once FEXCore's allocator can first use the 128TB of blocked VA space on 48-bit systems.
+  constexpr static uint32_t MAX_STATS_SIZE = 4 * 1024 * 1024;
+
+private:
+  // Implemented by the frontend: try to grow the region to NewSize bytes and return the resulting size.
+  // Returning the current size unchanged is treated as failure by AllocateMoreSlots().
+  virtual uint32_t FrontendAllocateSlots(uint32_t NewSize) = 0;
+  bool AllocateMoreSlots();
+};
+
+} // namespace FEX::Profiler