Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEX: Implements new sampling based stats #4291

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
8 changes: 8 additions & 0 deletions FEXCore/Source/Interface/Config/Config.json.in
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,14 @@
"Redirects the telemetry folder that FEX usually writes to.",
"By default telemetry data is stored in {$FEX_APP_DATA_LOCATION,{$XDG_DATA_HOME,$HOME}/.fex-emu/Telemetry/}"
]
},
"ProfileStats": {
"Type": "bool",
"Default": "false",
"Desc": [
"Enables FEX's low-overhead sampling profile statistics.",
"Requires a supported version of MangoHud to see the results"
]
}
},
"Hacks": {
Expand Down
3 changes: 2 additions & 1 deletion FEXCore/Source/Interface/Core/Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -831,8 +831,9 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT
}

uintptr_t ContextImpl::CompileBlock(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP, uint64_t MaxInst) {
FEXCORE_PROFILE_SCOPED("CompileBlock");
auto Thread = Frame->Thread;
FEXCORE_PROFILE_SCOPED("CompileBlock");
FEXCORE_PROFILE_ACCUMULATION(Thread, AccumulatedJITTime);

// Invalidate might take a unique lock on this, to guarantee that during invalidation no code gets compiled
auto lk = GuardSignalDeferringSection<std::shared_lock>(CodeInvalidationMutex, Thread);
Expand Down
6 changes: 6 additions & 0 deletions FEXCore/include/FEXCore/Debug/InternalThreadState.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ class OpDispatchBuilder;
class PassManager;
} // namespace FEXCore::IR

namespace FEXCore::Profiler {
struct ThreadStats;
};

namespace FEXCore::Core {

// Special-purpose replacement for std::unique_ptr to allow InternalThreadState to be standard layout.
Expand Down Expand Up @@ -95,6 +99,8 @@ struct InternalThreadState : public FEXCore::Allocator::FEXAllocOperators {

std::shared_mutex ObjectCacheRefCounter {};

FEXCore::Profiler::ThreadStats* ThreadStats {};
Sonicadvance1 marked this conversation as resolved.
Show resolved Hide resolved

///< Data pointer for exclusive use by the frontend
void* FrontendPtr;

Expand Down
96 changes: 96 additions & 0 deletions FEXCore/include/FEXCore/Utils/Profiler.h
Original file line number Diff line number Diff line change
@@ -1,13 +1,72 @@
// SPDX-License-Identifier: MIT
#pragma once
#include <atomic>
#include <cstdint>
#include <string_view>

#ifdef _M_X86_64
#include <x86intrin.h>
#endif

#include <FEXCore/Utils/CompilerDefs.h>

namespace FEXCore::Profiler {
// FEXCore live-stats
constexpr uint8_t STATS_VERSION = 1;
enum class AppType : uint8_t {
LINUX_32,
LINUX_64,
WIN_ARM64EC,
WIN_WOW64,
};

// Header at the start of the shared stats memory region. Written by FEX and
// read by an external sampler; the layout is an ABI — bump STATS_VERSION on
// any change.
struct ThreadStatsHeader {
uint8_t Version; // STATS_VERSION of the writer.
AppType app_type; // Which frontend flavor produced these stats.
uint8_t _pad[2]; // Explicit padding so fex_version starts at offset 4.
char fex_version[48]; // FEX git-describe string; may be truncated to fit.
std::atomic<uint32_t> Head; // Byte offset of the first ThreadStats slot in the linked list; 0 = list empty.
std::atomic<uint64_t> Size; // Current size of the mapped stats region in bytes (updated when the region grows).
};

// Per-thread statistics slot in the shared stats region. Slots form a singly
// linked list via `Next` (byte offsets from the region base); a TID of 0 marks
// the slot as free and tells readers to ignore it.
struct ThreadStats {
std::atomic<uint32_t> Next; // Byte offset of the next slot in the list; 0 = end of list.
std::atomic<uint32_t> TID; // Owning thread's TID; 0 means the slot is unallocated.

// Accumulated time (In unscaled CPU cycles!)
uint64_t AccumulatedJITTime;
uint64_t AccumulatedSignalTime;

// Accumulated event counts
uint64_t AccumulatedSIGBUSCount;
uint64_t AccumulatedSMCCount;
};

#ifdef ENABLE_FEXCORE_PROFILER

#ifdef _M_ARM_64
/**
 * @brief Get the raw cycle counter, serialized with an `isb` so the read is
 * not reordered with surrounding instructions.
 *
 * `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
 *
 * NOTE(review): CNTVCT_EL0 ticks at the generic timer frequency, which is not
 * necessarily the core clock — "unscaled cycles" elsewhere should be read
 * accordingly.
 */
static inline uint64_t GetCycleCounter() {
uint64_t Result {};
__asm volatile(R"(
isb;
mrs %[Res], CNTVCT_EL0;
)"
: [Res] "=r"(Result));
return Result;
}
#else
// x86-64: sample the TSC via `rdtscp`, which waits for prior instructions to
// retire before reading the counter. The auxiliary TSC_AUX output is unused.
static inline uint64_t GetCycleCounter() {
  unsigned AuxUnused;
  return __rdtscp(&AuxUnused);
}
#endif

FEX_DEFAULT_VISIBILITY void Init();
FEX_DEFAULT_VISIBILITY void Shutdown();
FEX_DEFAULT_VISIBILITY void TraceObject(std::string_view const Format);
Expand All @@ -34,6 +93,36 @@ class ProfilerBlock final {
// Declare a scoped profile block variable with a fixed name.
#define FEXCORE_PROFILE_SCOPED(name) FEXCore::Profiler::ProfilerBlock UniqueScopeName(ScopedBlock_, __LINE__)(name)

/**
 * @brief RAII helper that measures the cycle-counter delta over its lifetime
 * and accumulates it into a stat counter on destruction.
 *
 * @tparam T Stat counter type (must be usable with std::atomic_ref).
 * @tparam FlatOffset Constant added to every measured duration, e.g. to
 *         account for known fixed overhead of the measurement itself.
 *
 * The accumulation uses a relaxed atomic RMW so the shared stats memory is
 * never torn for concurrent readers, without paying for ordering.
 */
template<typename T, size_t FlatOffset = 0>
class AccumulationBlock final {
public:
  // `Stat` may be nullptr (stats disabled); the measurement is then dropped.
  explicit AccumulationBlock(T* Stat)
    : Begin {GetCycleCounter()}
    , Stat {Stat} {}

  // Non-copyable: a copy would accumulate the same duration twice.
  AccumulationBlock(const AccumulationBlock&) = delete;
  AccumulationBlock& operator=(const AccumulationBlock&) = delete;

  ~AccumulationBlock() {
    const auto Duration = GetCycleCounter() - Begin + FlatOffset;
    if (Stat) {
      auto ref = std::atomic_ref<T>(*Stat);
      ref.fetch_add(Duration, std::memory_order_relaxed);
    }
  }

private:
  uint64_t Begin;
  T* Stat;
};

// Declare a scoped accumulation variable: measures cycles spent in the
// enclosing scope and adds them to ThreadState->ThreadStats->Stat on scope
// exit. Safe when ThreadStats is nullptr (stats disabled) — the block is
// constructed with a nullptr stat and the measurement is dropped.
#define FEXCORE_PROFILE_ACCUMULATION(ThreadState, Stat) \
FEXCore::Profiler::AccumulationBlock<decltype(ThreadState->ThreadStats->Stat)> UniqueScopeName(ScopedAccumulation_, __LINE__)( \
ThreadState->ThreadStats ? &ThreadState->ThreadStats->Stat : nullptr);
// Immediately bump a stat counter by `value`. Plain (non-atomic) add;
// no-op when stats are disabled.
#define FEXCORE_PROFILE_INSTANT_INCREMENT(ThreadState, Stat, value) \
do { \
if (ThreadState->ThreadStats) { \
ThreadState->ThreadStats->Stat += value; \
} \
} while (0)

#else
[[maybe_unused]]
static void Init() {}
Expand All @@ -50,5 +139,12 @@ static void TraceObject(std::string_view const, uint64_t) {}
#define FEXCORE_PROFILE_SCOPED(...) \
do { \
} while (0)
#define FEXCORE_PROFILE_ACCUMULATION(...) \
do { \
} while (0)
#define FEXCORE_PROFILE_INSTANT_INCREMENT(...) \
do { \
} while (0)

#endif
} // namespace FEXCore::Profiler
3 changes: 2 additions & 1 deletion Source/Common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ set(SRCS
EnvironmentLoader.cpp
HostFeatures.cpp
JSONPool.cpp
StringUtil.cpp)
StringUtil.cpp
Profiler.cpp)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you adding a new set of files here instead of keeping it to a single set of files?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is frontend code that is shared between FEXInterpreter, wow64, and arm64ec.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this frontend code but the existing Profiler infrastructure in FEXCore not, despite also being Linux specific?

Copy link
Member

@neobrain neobrain Jan 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess my problem is that the 6 files (if I'm counting correctly) are placed in seemingly random locations: The existing Linux-specific code probably shouldn't be in FEXCore since the Windows-specific equivalent isn't there either.

(Not sure if moving it makes more sense than moving the Windows code to FEXCore.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That code should also get moved to the frontend at some point as long as the TraceObject handlers can still get called from FEXCore.


if (NOT MINGW_BUILD)
list (APPEND SRCS
Expand Down
120 changes: 120 additions & 0 deletions Source/Common/Profiler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// SPDX-License-Identifier: MIT
#include "Common/Profiler.h"
#include "git_version.h"

#include <FEXCore/Debug/InternalThreadState.h>

namespace FEX::Profiler {
// Initializes the shared stats region: fills in the header (version, app
// type, FEX version string, current size) and sets up the slot-array
// bookkeeping. No-op if the backing memory was never mapped.
void StatAllocBase::SaveHeader(FEXCore::Profiler::AppType AppType) {
  if (!Base) {
    return;
  }

  Head = reinterpret_cast<FEXCore::Profiler::ThreadStatsHeader*>(Base);
  Head->Size.store(CurrentSize, std::memory_order_relaxed);
  Head->Version = FEXCore::Profiler::STATS_VERSION;

  // Copy the FEX version string, truncating if necessary. Always leave a NUL
  // terminator so readers can safely treat `fex_version` as a C string.
  constexpr std::array<char, std::char_traits<char>::length(GIT_DESCRIBE_STRING) + 1> GitString = {GIT_DESCRIBE_STRING};
  strncpy(Head->fex_version, GitString.data(), std::min(GitString.size(), sizeof(Head->fex_version) - 1));
  Head->fex_version[sizeof(Head->fex_version) - 1] = '\0';
  Head->app_type = AppType;

  // Slot storage starts immediately after the header.
  Stats = reinterpret_cast<FEXCore::Profiler::ThreadStats*>(reinterpret_cast<uint64_t>(Base) + sizeof(FEXCore::Profiler::ThreadStatsHeader));

  RemainingSlots = TotalSlotsFromSize();
}

// Attempts to grow the stats region by doubling its size, delegating the
// actual remapping to the frontend-provided virtual overload. Returns false
// if the region could not grow (e.g. the size cap was reached).
bool StatAllocBase::AllocateMoreSlots() {
  const auto PreviousSlotCount = TotalSlotsFromSize();

  // The virtual overload reports the size actually achieved; an unchanged
  // size signals failure.
  const uint64_t GrownSize = AllocateMoreSlots(CurrentSize * 2);
  if (GrownSize == CurrentSize) {
    return false;
  }

  CurrentSize = GrownSize;
  // Publish the new region size so the reader side can follow the growth.
  Head->Size.store(CurrentSize, std::memory_order_relaxed);
  RemainingSlots = TotalSlotsFromSize() - PreviousSlotCount;

  return true;
}

// Allocates a ThreadStats slot for thread `TID` and links it onto the shared
// singly-linked list, growing the backing region if no free slots remain.
// Returns nullptr if the region is full and cannot grow further.
FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateBaseSlot(uint32_t TID) {
  if (!RemainingSlots) {
    if (!AllocateMoreSlots()) {
      return nullptr;
    }
  }

  // Find a free slot (TID == 0). RemainingSlots > 0 guarantees one exists.
  memory_barrier();
  FEXCore::Profiler::ThreadStats* AllocatedSlot {};
  for (size_t i = 0; i < TotalSlotsFromSize(); ++i) {
    AllocatedSlot = &Stats[i];
    if (AllocatedSlot->TID.load(std::memory_order_relaxed) == 0) {
      break;
    }
  }

  --RemainingSlots;

  // Slot might be reused, just zero it now.
  // BUGFIX: zero exactly one slot (ThreadStats). The previous code used
  // sizeof(ThreadStatsHeader), which is larger and clobbered the next slot.
  memset(AllocatedSlot, 0, sizeof(FEXCore::Profiler::ThreadStats));

  // TID != 0 means slot is allocated.
  AllocatedSlot->TID.store(TID, std::memory_order_relaxed);

  // Setup singly-linked list: either this is the new list head, or it gets
  // appended after the current tail.
  if (Head->Head.load(std::memory_order_relaxed) == 0) {
    Head->Head.store(OffsetFromStat(AllocatedSlot), std::memory_order_relaxed);
  } else {
    StatTail->Next.store(OffsetFromStat(AllocatedSlot), std::memory_order_relaxed);
  }

  // Update the tail.
  StatTail = AllocatedSlot;
  return AllocatedSlot;
}

// Releases a thread's stats slot: marks it free (TID = 0) and unlinks it from
// the shared singly-linked list, fixing up Head/Next offsets and the cached
// tail pointer. Statement order matters here — the slot is invalidated for
// readers *before* the list is restitched.
void StatAllocBase::DeallocateBaseSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) {
if (!AllocatedSlot) {
return;
}

// TID == 0 will signal the reader to ignore this slot & deallocate it!
AllocatedSlot->TID.store(0, std::memory_order_relaxed);

// Ensure the TID clear is visible before the list is modified below.
memory_barrier();

const auto SlotOffset = OffsetFromStat(AllocatedSlot);
const auto AllocatedSlotNext = AllocatedSlot->Next.load(std::memory_order_relaxed);

const bool IsTail = AllocatedSlot == StatTail;

// Update the linked list.
if (Head->Head == SlotOffset) {
// Removing the list head: point the header at our successor.
Head->Head.store(AllocatedSlotNext, std::memory_order_relaxed);
if (IsTail) {
StatTail = nullptr;
}
} else {
// Not the head: scan for our predecessor (the slot whose Next is us) and
// splice it to our successor.
for (size_t i = 0; i < TotalSlotsFromSize(); ++i) {
auto Slot = &Stats[i];
auto NextSlotOffset = Slot->Next.load(std::memory_order_relaxed);

if (NextSlotOffset == SlotOffset) {
Slot->Next.store(AllocatedSlotNext, std::memory_order_relaxed);

if (IsTail) {
// This slot is now the tail.
StatTail = Slot;
}
break;
}
}
}

++RemainingSlots;
}

} // namespace FEX::Profiler
67 changes: 67 additions & 0 deletions Source/Common/Profiler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// SPDX-License-Identifier: MIT
/*
$info$
tags: Common|Profiler
desc: Frontend profiler common code
$end_info$
*/
#pragma once
#include <FEXCore/Utils/Profiler.h>

namespace FEXCore::Core {
struct InternalThreadState;
}

#ifdef _M_ARM_64
// Store-store barrier: orders stats-slot stores (e.g. publishing or clearing a
// slot's TID) before subsequent stores become visible to the reader process.
static inline void memory_barrier() {
  asm volatile("dmb ishst;" ::: "memory");
}

#else
// NOTE(review): presumably relying on x86-64's strongly-ordered stores, so no
// hardware barrier is emitted here — confirm a compiler barrier is not needed.
static inline void memory_barrier() {
  // Intentionally empty.
}
#endif

namespace FEX::Profiler {
class StatAllocBase {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's some heavy logic in here, so this should explain what it's doing.

public:
virtual ~StatAllocBase() = default;
Sonicadvance1 marked this conversation as resolved.
Show resolved Hide resolved

protected:
FEXCore::Profiler::ThreadStats* AllocateBaseSlot(uint32_t TID);
Sonicadvance1 marked this conversation as resolved.
Show resolved Hide resolved
void DeallocateBaseSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot);

// Byte offset of a slot from the base of the mapped region; these offsets are
// what the shared linked list (Head/Next) stores. Fits in 32 bits since the
// region is capped at MAX_STATS_SIZE.
uint32_t OffsetFromStat(FEXCore::Profiler::ThreadStats* Stat) const {
return reinterpret_cast<uint64_t>(Stat) - reinterpret_cast<uint64_t>(Base);
}
// Number of ThreadStats slots that fit in the current mapping after the
// header. NOTE(review): the trailing -1 keeps one slot of slack — confirm why
// it is reserved.
size_t TotalSlotsFromSize() const {
return (CurrentSize - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats) - 1;
}
// Maps a byte offset (as stored in Head/Next) back to a slot index.
size_t SlotIndexFromOffset(uint32_t Offset) {
return (Offset - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats);
}

void SaveHeader(FEXCore::Profiler::AppType AppType);

void* Base;
size_t CurrentSize {};
FEXCore::Profiler::ThreadStatsHeader* Head {};
FEXCore::Profiler::ThreadStats* Stats;
FEXCore::Profiler::ThreadStats* StatTail {};
Comment on lines +51 to +53
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you preferring a custom-implemented linked list over flat memory? This seems neither more convenient for the implementation nor like it would improve performance.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sparsity in the allocation buffer is a concern, so this is how I'm keeping the implementation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you elaborate what exactly the problem is?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reader side would need to walk the entire allocated buffer space to ensure it doesn't miss any slots, instead of the average case of a hundred or so.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What scenario exactly would trigger this? I don't know what your reader side looks like, but it sounds like a bitmask and/or active slot count would easily resolve this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm just going to leave it as a linked list. We can rev the version number and change it to a bitmask in the future if someone is feeling up to writing that code in the future.

Copy link
Member

@neobrain neobrain Jan 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the choice between linked lists and flat memory doesn't seem to be so important, why is the focus on weakly-ordered atomics considered critical?

(I don't agree using a custom data structure here is the way forward since it makes up a good chunk of the complexity in this PR, but on top of that the performance targets don't seem consistent here as-is.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the atomics I don't want the writer side to be affected by cacheline sharing problems at all. Semantically these are two independent things. The containing data which is FEXCore::Profiler::ThreadStats and the way to walk the list of data.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand they are different things, but it seems arbitrary to want this (with no explicit motivation) while using a data structure with poor performance characteristics in the same module.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, which is why we leave some of the "poor performance" problems on the reader side which has something like 500ms to sample the data, even though this is going to be hundreds of nanoseconds regardless.

uint64_t RemainingSlots;

// Limited to 4MB which should be a few hundred threads of tracking capability.
// I (Sonicadvance1) wanted to reserve 128MB of VA space because it's cheap, but ran in to a bug when running WINE.
// WINE allocates [0x7fff'fe00'0000, 0x7fff'ffff'0000) which /consistently/ overlaps with FEX's sigaltstack.
// This only occurs when this stat allocation size is large as the top-down allocation pushes the alt-stack further.
// Additionally, only occurs on 48-bit VA systems, as mmap on lesser VA will fail regardless.
// TODO: Bump allocation size up once FEXCore's allocator can first use the 128TB of blocked VA space on 48-bit systems.
constexpr static size_t MAX_STATS_SIZE = 4 * 1024 * 1024;

private:
virtual uint64_t AllocateMoreSlots(uint64_t NewSize) = 0;
bool AllocateMoreSlots();
Sonicadvance1 marked this conversation as resolved.
Show resolved Hide resolved
};

} // namespace FEX::Profiler
Loading
Loading