diff --git a/scheds/rust/scx_mitosis/build.rs b/scheds/rust/scx_mitosis/build.rs index f617cea07d..a5854f718c 100644 --- a/scheds/rust/scx_mitosis/build.rs +++ b/scheds/rust/scx_mitosis/build.rs @@ -6,7 +6,7 @@ fn main() { scx_cargo::BpfBuilder::new() .unwrap() - .enable_intf("src/bpf/intf.h", "bpf_intf.rs") + .enable_intf("src/bpf/intf_rust.h", "bpf_intf.rs") .enable_skel("src/bpf/mitosis.bpf.c", "bpf") .build() .unwrap(); diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h new file mode 100644 index 0000000000..fc50f17fba --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -0,0 +1,201 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header defines the 64-bit dispatch queue (DSQ) ID encoding + * scheme for scx_mitosis, using type fields to distinguish between + * per-CPU and cell+L3 domain queues. It includes helper functions to + * construct, validate, and parse these DSQ IDs for queue management. + */ +#pragma once + +#include "intf.h" +#include "mitosis.bpf.h" + +/* + * ================================ + * BPF DSQ ID Layout (64 bits wide) + * ================================ + * + * Top-level format: + * [63] [62..0] + * [ B] [ ID ] + * + * If B == 1 it is a Built-in DSQ + * ------------------------- + * [63] [62] [61 .. 32] [31..0] + * [ 1] [ L] [ R ] [ V ] + * + * - L (bit 62): LOCAL_ON flag + * If L == 1 -> V = CPU number + * - R (30 bits): reserved / unused + * - V (32 bits): value (e.g., CPU#) + * + * If B == 0 -> User-defined DSQ + * ----------------------------- + * Only the low 32 bits are used. + * + * [63 .. 32] [31..0] + * [ 0][ unused ] [ VAL ] + * + * Mitosis uses VAL as follows: + * + * [31..28] [27..0] + * [QTYPE ] [DATA ] + * + * QTYPE encodes the queue type: + * + * QTYPE = 0x1 -> Per-CPU Q + * [31..28] [27 .. .. 0] + * [ 0001 ] [ CPU# ] + * [Q-TYPE:1] + * + * QTYPE = 0x2 -> Cell+L3 Q + * [31..28] [27 .. 16] [15 .. 0] + * [ 0010 ] [ CELL# ] [ L3ID ] + * [Q-TYPE:2] + * + */ +/* + * The use of these bitfields depends on compiler defined byte AND bit ordering. + * Make sure we're only building with Clang/LLVM and that we're little-endian. + */ +#ifndef __clang__ +#error "This code must be compiled with Clang/LLVM (eBPF: clang -target bpf)." +#endif + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +#error "dsq64 bitfield layout assumes little-endian (bpfel)." 
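+/*
+ * (Why this matters: on a big-endian target the bitfield allocation order
+ * flips, so the example encodings from the layout above would no longer
+ * hold; e.g. on little-endian, get_cpu_dsq_id(5) yields raw == 0x10000005
+ * with QTYPE = 0x1 in bits 31..28 and the CPU number in bits 27..0, and
+ * get_cell_l3_dsq_id(2, 3) yields raw == 0x20020003.)
+ */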
+#endif + +/* ---- Bitfield widths (bits) ---- */ +#define CPU_B 28 +#define L3_B 16 +#define CELL_B 12 +#define TYPE_B 4 +#define DATA_B 28 +#define RSVD_B 32 + +/* Sum checks (in bits) */ +_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); +_Static_assert(L3_B + CELL_B + TYPE_B == 32, + "CELL+L3 layout low half must be 32 bits"); +_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); + +typedef union { + u64 raw; + + /* Per-CPU user DSQ */ + struct { + u64 cpu : CPU_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } cpu_dsq; + + /* Cell+L3 user DSQ */ + struct { + u64 l3 : L3_B; + u64 cell : CELL_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } cell_l3_dsq; + + /* Generic user view */ + struct { + u64 data : DATA_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } user_dsq; + + /* Built-in DSQ view */ + struct { + u64 value : 32; + u64 rsvd : 30; + u64 local_on : 1; + u64 builtin : 1; + } builtin_dsq; + + /* NOTE: Considered packed and aligned attributes, but that's redundant */ +} dsq_id_t; + +/* + * Invalid DSQ ID Sentinel: + * invalid bc bit 63 clear (it's a user DSQ) && dsq_type == 0 (no type) + * Good for catching uninitialized DSQ IDs. +*/ +#define DSQ_INVALID ((u64)0) + +_Static_assert(sizeof(((dsq_id_t){ 0 }).cpu_dsq) == sizeof(u64), + "cpu view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).cell_l3_dsq) == sizeof(u64), + "cell+l3 view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).user_dsq) == sizeof(u64), + "user common view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).builtin_dsq) == sizeof(u64), + "builtin view must be 8 bytes"); + +/* Compile-time checks (in bytes) */ +_Static_assert(sizeof(dsq_id_t) == sizeof(u64), + "dsq_id_t must be 8 bytes (64 bits)"); +_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), + "dsq_id_t must be 8-byte aligned"); + +/* DSQ type enumeration */ +enum dsq_type { + DSQ_TYPE_NONE, + DSQ_TYPE_CPU, + DSQ_TYPE_CELL_L3, +}; + +/* Range guards */ +_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); +_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); +_Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); +_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), + "DSQ_TYPE_CELL_L3 must fit in field"); + +/* + * While I considered error propagation, I decided to bail to force errors early. +*/ + +static inline bool is_user_dsq(dsq_id_t dsq_id) +{ + return !dsq_id.builtin_dsq.builtin && + dsq_id.user_dsq.type != DSQ_TYPE_NONE; +} + +// Is this a per CPU DSQ? +static inline bool is_cpu_dsq(dsq_id_t dsq_id) +{ + return is_user_dsq(dsq_id) && dsq_id.user_dsq.type == DSQ_TYPE_CPU; +} + +// If this is a per cpu dsq, return the cpu +static inline u32 get_cpu_from_dsq(dsq_id_t dsq_id) +{ + if (!is_cpu_dsq(dsq_id)) + scx_bpf_error("trying to get cpu from non-cpu dsq\n"); + + return dsq_id.cpu_dsq.cpu; +} + +/* Helper functions to construct DSQ IDs */ +static inline dsq_id_t get_cpu_dsq_id(u32 cpu) +{ + // Check for valid CPU range, 0 indexed so >=. 
+ if (cpu >= MAX_CPUS) + scx_bpf_error("invalid cpu %u\n", cpu); + + return (dsq_id_t){ .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; +} + +static inline dsq_id_t get_cell_l3_dsq_id(u32 cell, u32 l3) +{ + if (cell >= MAX_CELLS || l3 >= MAX_L3S) + scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); + + return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, + .cell = cell, + .type = DSQ_TYPE_CELL_L3 } }; +} diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 0658734545..b1fcbf7941 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -5,7 +5,8 @@ #ifndef __INTF_H #define __INTF_H -#ifndef __KERNEL__ +#ifndef __BPF__ +#include typedef unsigned long long u64; typedef unsigned int u32; typedef _Bool bool; @@ -18,6 +19,10 @@ typedef _Bool bool; #include #endif +/* ---- Work stealing config (compile-time) ------------------------------- */ +#define MITOSIS_ENABLE_STEALING 1 +/* ----------------------------------------------------------------------- */ + enum consts { CACHELINE_SIZE = 64, MAX_CPUS_SHIFT = 9, @@ -28,8 +33,67 @@ enum consts { PCPU_BASE = 0x80000000, MAX_CG_DEPTH = 256, + + MAX_L3S = 16, +}; + +/* Kernel side sees the real lock; userspace sees padded bytes of same size/alignment */ +#if defined(__BPF__) +#define CELL_LOCK_T struct bpf_spin_lock +#else +/* userspace placeholder: kernel won’t copy spin_lock */ +#define CELL_LOCK_T \ + struct { \ + u32 __pad; \ + } /* 4-byte aligned as required */ +#endif + +struct cell { + // This is a lock in the kernel and padding in the user + CELL_LOCK_T lock; // Assumed to be the first entry (see below) + + // Whether or not the cell is used + u32 in_use; + + // Number of CPUs in this cell + u32 cpu_cnt; + + // Number of L3s with at least one CPU in this cell + u32 l3_present_cnt; + + // Number of CPUs from each L3 assigned to this cell + u32 l3_cpu_cnt[MAX_L3S]; + + // per-L3 vtimes within this cell + u64 l3_vtime_now[MAX_L3S]; }; +// Putting the lock first in the struct is our convention. +// We pad this space when in Rust code that will never see the lock value. +// We intentionally avoid it in copy_cell_no_lock to keep the verifier happy. +// It is a BPF constraint that it is 4 byte aligned. + +// All assertions work for both BPF and userspace builds +_Static_assert(offsetof(struct cell, lock) == 0, + "lock/padding must be first field"); + +_Static_assert(sizeof(((struct cell *)0)->lock) == 4, + "lock/padding must be 4 bytes"); + +_Static_assert(_Alignof(CELL_LOCK_T) == 4, + "lock/padding must be 4-byte aligned"); + +_Static_assert(offsetof(struct cell, in_use) == 4, + "in_use must follow 4-byte lock/padding"); + +// Verify these are the same size in both BPF and Rust. 
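+/*
+ * Size arithmetic behind the asserts below, with MAX_L3S == 16 as defined
+ * above:
+ *   4 x u32 header fields (lock/pad, in_use, cpu_cnt, l3_present_cnt) = 16
+ *   u32 l3_cpu_cnt[MAX_L3S]   (4 * 16)                                = 64
+ *   u64 l3_vtime_now[MAX_L3S] (8 * 16)                                = 128
+ * l3_vtime_now begins at offset 80, which is already 8-byte aligned, so
+ * there is no padding and the total is 208 bytes.
+ */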
+_Static_assert(sizeof(struct cell) == + ((4 * sizeof(u32)) + (4 * MAX_L3S) + (8 * MAX_L3S)), + "struct cell size must be stable for Rust bindings"); + +_Static_assert(sizeof(struct cell) == 208, + "struct cell must be exactly 208 bytes"); + /* Statistics */ enum cell_stat_idx { CSTAT_LOCAL, @@ -39,6 +103,14 @@ enum cell_stat_idx { NR_CSTATS, }; +/* Function invocation counters */ +enum fn_counter_idx { + COUNTER_SELECT_CPU, + COUNTER_ENQUEUE, + COUNTER_DISPATCH, + NR_COUNTERS, +}; + struct cpu_ctx { u64 cstats[MAX_CELLS][NR_CSTATS]; u64 cell_cycles[MAX_CELLS]; @@ -51,14 +123,4 @@ struct cgrp_ctx { bool cell_owner; }; -/* - * cell is the per-cell book-keeping -*/ -struct cell { - // current vtime of the cell - u64 vtime_now; - // Whether or not the cell is used or not - u32 in_use; -}; - #endif /* __INTF_H */ diff --git a/scheds/rust/scx_mitosis/src/bpf/intf_rust.h b/scheds/rust/scx_mitosis/src/bpf/intf_rust.h new file mode 100644 index 0000000000..f8ffd3252a --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/intf_rust.h @@ -0,0 +1,4 @@ +/* Force userspace path for Rust bindgen */ +#undef __BPF__ +#undef __bpf__ +#include "intf.h" diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h new file mode 100644 index 0000000000..492b2723c7 --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -0,0 +1,310 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header assists adding L3 cache awareness to scx_mitosis by defining + * maps and fns for managing CPU-to-L3 domain mappings. It provides code to + * recalculate per-L3 CPU counts within cells and implements weighted + * random L3 selection for tasks. It also tracks work-stealing + * statistics for cross-L3 task migrations. + */ +#pragma once + +#include "mitosis.bpf.h" +#include "intf.h" + +typedef u32 l3_id_t; +#define L3_INVALID ((l3_id_t)~0u) + +// Configure how aggressively we steal work. 
+// When task is detected as a steal candidate, skip it this many times +// On a web server workload, 100 reduced steal count by ~90% +#define PREVENT_N_STEALS 0 + +/* Work stealing statistics map - accessible from both BPF and userspace */ +struct steal_stats_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 1); +}; + +// A CPU -> L3 cache ID map +struct cpu_to_l3_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, MAX_CPUS); +}; + +struct l3_to_cpus_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cpumask); + __uint(max_entries, MAX_L3S); +}; + +extern struct cpu_to_l3_map cpu_to_l3; +extern struct l3_to_cpus_map l3_to_cpus; +extern struct steal_stats_map steal_stats; + +static inline const bool l3_is_valid(u32 l3_id) +{ + if (l3_id == L3_INVALID) + return false; + + return (l3_id >= 0) && (l3_id < MAX_L3S); +} + +static inline void init_task_l3(struct task_ctx *tctx) +{ + tctx->l3 = L3_INVALID; + +#if MITOSIS_ENABLE_STEALING + tctx->pending_l3 = L3_INVALID; + tctx->steal_count = 0; + tctx->last_stolen_at = 0; + tctx->steals_prevented = 0; +#endif +} + +static inline const struct cpumask *lookup_l3_cpumask(u32 l3) +{ + struct cpumask *mask; + + if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { + scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); + return NULL; + } + + return mask; +} + +/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes */ +static __always_inline void recalc_cell_l3_counts(u32 cell_idx) +{ + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_error("recalc_cell_l3_counts: invalid cell %d", + cell_idx); + return; + } + + CPUMASK_GUARD(tmp_guard); + if (!tmp_guard.mask) { + scx_bpf_error( + "recalc_cell_l3_counts: failed to create tmp mask"); + return; + } + + u32 l3, l3s_present = 0, total_cpus = 0; + // Just so we don't hold the lock longer than necessary + u32 l3_cpu_cnt_tmp[MAX_L3S] = { 0 }; + + { // RCU context + RCU_READ_GUARD(); + const struct cpumask *cell_mask = + lookup_cell_cpumask(cell_idx); // RCU ptr + + if (!cell_mask) { + scx_bpf_error( + "recalc_cell_l3_counts: invalid cell mask"); + return; + } + + bpf_for(l3, 0, nr_l3) + { + const struct cpumask *l3_mask = lookup_l3_cpumask(l3); + if (!l3_mask) { + scx_bpf_error( + "recalc_cell_l3_counts: invalid l3 mask"); + return; + } + + bpf_cpumask_and(tmp_guard.mask, cell_mask, l3_mask); + + u32 cnt = bpf_cpumask_weight( + (const struct cpumask *)tmp_guard.mask); + + l3_cpu_cnt_tmp[l3] = cnt; + + bpf_printk("recalc_cell_l3_counts: cnt %d", cnt); + + // These are counted across the whole cell + total_cpus += cnt; + + // Number of non-empty L3s in this cell + if (cnt) + l3s_present++; + } + } // unlock RCU + + // Write to cell + bpf_spin_lock(&cell->lock); + for (u32 l3 = 0; l3 < nr_l3; l3++) { + cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; + } + + cell->l3_present_cnt = l3s_present; + cell->cpu_cnt = total_cpus; + bpf_spin_unlock(&cell->lock); +} + +/** + * Weighted random selection of an L3 cache domain for a task. + * + * Uses the CPU count in each L3 domain within the cell as weights to + * probabilistically select an L3. L3 domains with more CPUs in the cell + * have higher probability of being selected. 
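+ * For example (hypothetical counts): if a cell's l3_cpu_cnt is {4, 2, 2},
+ * cpu_cnt is 8 and the random target in [0, 8) selects L3 0 with
+ * probability 4/8 and L3s 1 and 2 with probability 2/8 each.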
+ * + * @cell_id: The cell ID to select an L3 from + * @return: L3 ID on success, L3_INVALID on error + */ +static inline s32 pick_l3_for_task(u32 cell_id) +{ + struct cell *cell; + + /* Look up the cell structure */ + if (!(cell = lookup_cell(cell_id))) { + scx_bpf_error("pick_l3_for_task: invalid cell %d", cell_id); + return L3_INVALID; + } + + // Snapshot the current state of the cell + struct cell cell_snapshot; + bpf_spin_lock(&cell->lock); + copy_cell_skip_lock(&cell_snapshot, cell); + bpf_spin_unlock(&cell->lock); + + // No cpus + if (!cell_snapshot.cpu_cnt) { + scx_bpf_error( + "pick_l3_for_task: cell %d has no CPUs accounted yet", + cell_id); + return L3_INVALID; + } + + /* Find the L3 domain corresponding to the target value using + * weighted selection - accumulate CPU counts until we exceed target */ + + /* Generate random target value in range [0, cpu_cnt) */ + u32 target = bpf_get_prandom_u32() % cell_snapshot.cpu_cnt; + u32 l3, cur = 0; + s32 ret = L3_INVALID; + + // This could be a prefix sum. Find first l3 where we exceed target + bpf_for(l3, 0, nr_l3) + { + cur += cell_snapshot.l3_cpu_cnt[l3]; + if (target < cur) { + ret = (s32)l3; + break; + } + } + + if (ret == L3_INVALID) { + scx_bpf_error("pick_l3_for_task: invalid L3"); + return L3_INVALID; + } + + return ret; +} + +#if MITOSIS_ENABLE_STEALING + +static inline bool try_stealing_this_task(struct task_ctx *task_ctx, + s32 local_l3, u64 candidate_dsq) +{ + // Attempt the steal, can fail beacuse it's a race. + if (!scx_bpf_dsq_move_to_local(candidate_dsq)) + return false; + + // We got the task! + task_ctx->steal_count++; + task_ctx->last_stolen_at = scx_bpf_now(); + /* Retag to thief L3 (the one for this cpu) */ + task_ctx->pending_l3 = local_l3; + task_ctx->steals_prevented = 0; + + /* Increment steal counter in map */ + u32 key = 0; + u64 *count = bpf_map_lookup_elem(&steal_stats, &key); + // NOTE: This could get expensive, but I'm not anticipating that many steals. Percpu if we care. + if (count) + __sync_fetch_and_add(count, 1); + + return true; +} + +/* Work stealing: + * Scan sibling (cell,L3) DSQs in the same cell and steal the first queued task if it can run on this cpu +*/ +static inline bool try_stealing_work(u32 cell, s32 local_l3) +{ + if (!l3_is_valid(local_l3)) + scx_bpf_error("try_stealing_work: invalid local_l3"); + + struct cell *cell_ptr = lookup_cell(cell); + if (!cell_ptr) + scx_bpf_error("try_stealing_work: invalid cell"); + + // Loop over all other L3s, looking for a queued task to steal + u32 i; + bpf_for(i, 1, nr_l3) + { + // Start with the next one to spread out the load + u32 candidate_l3 = (local_l3 + i) % nr_l3; + + // Prevents the optimizer from removing the following conditional return + // so that the verifier knows the read wil be safe + barrier_var(candidate_l3); + + if (candidate_l3 >= MAX_L3S) + continue; + + // Skip L3s that are not present in this cell + // Note: rechecking cell_ptr for verifier + // TODO: Lock? 
+ if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) + continue; + + u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3).raw; + + struct task_struct *task = NULL; + struct task_ctx *task_ctx; + // I'm only using this for the verifier + bool found_task = false; + + // Optimization: skip if faster than constructing an iterator + // Not redundant with later checking if task found (race) + if (!scx_bpf_dsq_nr_queued(candidate_dsq)) + continue; + + // Just a trick for peeking the head element + bpf_for_each(scx_dsq, task, candidate_dsq, 0) + { + task_ctx = lookup_task_ctx(task); + found_task = (task_ctx != NULL); + break; + } + + // No task? Try next L3 + if (!found_task) + continue; + + // This knob throttles stealing. + // TODO: make runtime configurable + if (task_ctx->steals_prevented++ < PREVENT_N_STEALS) { + continue; + } + + if (!try_stealing_this_task(task_ctx, local_l3, candidate_dsq)) + continue; + + // Success, we got a task (no guarantee it was the one we peeked though... race) + return true; + } + return false; +} +#endif diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index b40c06e79d..3e1eac406e 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -12,16 +12,12 @@ * cgroups belonging to the cell. */ -#include "intf.h" - -#ifdef LSP -#define __bpf__ -#include "../../../../include/scx/common.bpf.h" -#include "../../../../include/scx/ravg_impl.bpf.h" -#else -#include -#include -#endif +// TODO: fix debug printer. +// #include "intf.h" + +#include "mitosis.bpf.h" +#include "dsq.bpf.h" +#include "l3_aware.bpf.h" char _license[] SEC("license") = "GPL"; @@ -35,6 +31,7 @@ const volatile unsigned char all_cpus[MAX_CPUS_U8]; const volatile u64 slice_ns; const volatile u64 root_cgid = 1; +const volatile u32 nr_l3 = 1; /* * CPU assignment changes aren't fully in effect until a subsequent tick() * configuration_seq is bumped on each assignment change @@ -48,23 +45,32 @@ private(root_cgrp) struct cgroup __kptr *root_cgrp; UEI_DEFINE(uei); +// Cells now defined as a map so we can lock. +struct cell_map cells SEC(".maps"); + /* - * We store per-cpu values along with per-cell values. Helper functions to - * translate. 
- */ -static inline u32 cpu_dsq(u32 cpu) -{ - return PCPU_BASE | cpu; -} + * Maps used for L3-aware scheduling +*/ +#if 0 +struct cell_locks_map cell_locks SEC(".maps"); +#endif +struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); +struct l3_to_cpus_map l3_to_cpus SEC(".maps"); -static inline u32 cell_dsq(u32 cell) -{ - return cell; -} +/* + * Maps for statistics +*/ +struct function_counters_map function_counters SEC(".maps"); +struct steal_stats_map steal_stats SEC(".maps"); -static inline u32 dsq_to_cpu(u32 dsq) +static inline void increment_counter(enum fn_counter_idx idx) { - return dsq & ~PCPU_BASE; + u64 *counter; + u32 key = idx; + + counter = bpf_map_lookup_elem(&function_counters, &key); + if (counter) + (*counter)++; } static inline struct cgroup *lookup_cgrp_ancestor(struct cgroup *cgrp, @@ -119,28 +125,6 @@ static inline struct cgroup *task_cgroup(struct task_struct *p) return cgrp; } -/* - * task_ctx is the per-task information kept by scx_mitosis - */ -struct task_ctx { - /* cpumask is the set of valid cpus this task can schedule on */ - /* (tasks cpumask anded with its cell cpumask) */ - struct bpf_cpumask __kptr *cpumask; - /* started_running_at for recording runtime */ - u64 started_running_at; - u64 basis_vtime; - /* For the sake of monitoring, each task is owned by a cell */ - u32 cell; - /* For the sake of scheduling, a task is exclusively owned by either a cell - * or a cpu */ - u32 dsq; - /* latest configuration that was applied for this task */ - /* (to know if it has to be re-applied) */ - u32 configuration_seq; - /* Is this task allowed on all cores of its cell? */ - bool all_cell_cpus_allowed; -}; - struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); @@ -185,20 +169,6 @@ static inline struct cpu_ctx *lookup_cpu_ctx(int cpu) return cctx; } -struct cell cells[MAX_CELLS]; - -static inline struct cell *lookup_cell(int idx) -{ - struct cell *cell; - - cell = MEMBER_VPTR(cells, [idx]); - if (!cell) { - scx_bpf_error("Invalid cell %d", idx); - return NULL; - } - return cell; -} - /* * Cells are allocated concurrently in some cases (e.g. cgroup_init). * allocate_cell and free_cell enable these allocations to be done safely @@ -212,8 +182,17 @@ static inline int allocate_cell() if (!(c = lookup_cell(cell_idx))) return -1; - if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) + bpf_spin_lock(&c->lock); + if (c->in_use == 0) { + // Zero everything except the lock (which is first) + __builtin_memset(&c->in_use, 0, + sizeof(struct cell) - + sizeof(CELL_LOCK_T)); + c->in_use = 1; // Then mark as in use + bpf_spin_unlock(&c->lock); return cell_idx; + } + bpf_spin_unlock(&c->lock); } scx_bpf_error("No available cells to allocate"); return -1; @@ -292,7 +271,6 @@ static inline int update_task_cpumask(struct task_struct *p, { const struct cpumask *cell_cpumask; struct cpu_ctx *cpu_ctx; - struct cell *cell; u32 cpu; if (!(cell_cpumask = lookup_cell_cpumask(tctx->cell))) @@ -301,11 +279,24 @@ static inline int update_task_cpumask(struct task_struct *p, if (!tctx->cpumask) return -EINVAL; + /* + * Calculate the intersection of CPUs that are both: + * 1. In this task's assigned cell (cell_cpumask) + * 2. Allowed by the task's CPU affinity (p->cpus_ptr) + * Store result in tctx->cpumask - this becomes the effective CPU set + * where this task can actually run. 
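+ * For example (hypothetical masks): if the cell spans CPUs 0-7 and the
+ * task's affinity is {2, 3, 9}, the effective set becomes {2, 3}.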
+ */ bpf_cpumask_and(tctx->cpumask, cell_cpumask, p->cpus_ptr); - if (cell_cpumask) - tctx->all_cell_cpus_allowed = - bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); + /* + * Check if the task can run on ALL CPUs in its assigned cell. + * If cell_cpumask is a subset of p->cpus_ptr, it means the task's + * CPU affinity doesn't restrict it within the cell - it can use + * any CPU in the cell. This affects scheduling decisions later. + * True if all the bits in cell_cpumask are set in p->cpus_ptr. + */ + tctx->all_cell_cpus_allowed = + bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); /* * XXX - To be correct, we'd need to calculate the vtime @@ -317,16 +308,66 @@ static inline int update_task_cpumask(struct task_struct *p, * Revisit if high frequency dynamic cell switching * needs to be supported. */ + + // We want to set the task vtime to that of the cell it's joining. if (tctx->all_cell_cpus_allowed) { - tctx->dsq = cell_dsq(tctx->cell); - if (!(cell = lookup_cell(tctx->cell))) + const struct cpumask *l3_mask = NULL; + if (tctx->l3 != L3_INVALID) { + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + /* If the L3 no longer intersects the cell's cpumask, invalidate it */ + if (!l3_mask || + !bpf_cpumask_intersects(cell_cpumask, l3_mask)) + tctx->l3 = L3_INVALID; + } + + /* --- Pick a new L3 if needed --- */ + if (tctx->l3 == L3_INVALID) { + s32 new_l3 = pick_l3_for_task(tctx->cell); + if (new_l3 < 0) { + scx_bpf_error("bad L3: %d", new_l3); + return -ENODEV; + } + tctx->l3 = new_l3; + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + if (!l3_mask) + return -ENOENT; + } + + /* --- Narrow the effective cpumask by the chosen L3 --- */ + /* tctx->cpumask already contains (task_affinity ∧ cell_mask) */ + if (tctx->cpumask) + bpf_cpumask_and(tctx->cpumask, + (const struct cpumask *)tctx->cpumask, + l3_mask); + + /* If empty after intersection, nothing can run here */ + if (tctx->cpumask && + bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { + scx_bpf_error("Empty cpumask after intersection"); + return -ENODEV; + } + + /* --- Point to the correct (cell,L3) DSQ and set vtime baseline --- */ + tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); + + struct cell *cell = lookup_cell(tctx->cell); + if (!cell) { + scx_bpf_error("Invalid cell"); return -ENOENT; - p->scx.dsq_vtime = READ_ONCE(cell->vtime_now); + } + + if (!l3_is_valid(tctx->l3)) { + scx_bpf_error("Invalid L3 %d", tctx->l3); + return -EINVAL; + } + + p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); } else { + /* Task is CPU-restricted, use task mask */ cpu = bpf_cpumask_any_distribute(p->cpus_ptr); if (!(cpu_ctx = lookup_cpu_ctx(cpu))) return -ENOENT; - tctx->dsq = cpu_dsq(cpu); + tctx->dsq = get_cpu_dsq_id(cpu); p->scx.dsq_vtime = READ_ONCE(cpu_ctx->vtime_now); } @@ -442,20 +483,24 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, struct cpu_ctx *cctx; struct task_ctx *tctx; + increment_counter(COUNTER_SELECT_CPU); + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) return prev_cpu; if (maybe_refresh_cell(p, tctx) < 0) return prev_cpu; + /* Pinned path: only if our task really requires a per-CPU queue. 
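+ * (i.e. its affinity does not cover every CPU in its cell, so
+ * update_task_cpumask() assigned it a per-CPU DSQ)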
*/ if (!tctx->all_cell_cpus_allowed) { cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); - cpu = dsq_to_cpu(tctx->dsq); + cpu = get_cpu_from_dsq(tctx->dsq); if (scx_bpf_test_and_clear_cpu_idle(cpu)) scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); return cpu; } + // Grab an idle core if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) { cstat_inc(CSTAT_LOCAL, tctx->cell, cctx); scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); @@ -489,14 +534,17 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) s32 cpu = -1; u64 basis_vtime; + increment_counter(COUNTER_ENQUEUE); + if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1))) return; if (maybe_refresh_cell(p, tctx) < 0) return; + // Cpu pinned work if (!tctx->all_cell_cpus_allowed) { - cpu = dsq_to_cpu(tctx->dsq); + cpu = get_cpu_from_dsq(tctx->dsq); } else if (!__COMPAT_is_enq_cpu_selected(enq_flags)) { /* * If we haven't selected a cpu, then we haven't looked for and kicked an @@ -520,12 +568,23 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) } if (tctx->all_cell_cpus_allowed) { + // This is a task that can run on any cpu in the cell + cstat_inc(CSTAT_CELL_DSQ, tctx->cell, cctx); - /* Task can use any CPU in its cell, so use the cell DSQ */ + + /* Task can use any CPU in its cell, set basis_vtime from per-(cell, L3) vtime */ if (!(cell = lookup_cell(tctx->cell))) return; - basis_vtime = READ_ONCE(cell->vtime_now); + + if (!l3_is_valid(tctx->l3)) { + scx_bpf_error("Invalid L3 ID for task %d in enqueue", + p->pid); + return; + } + basis_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); + } else { + // This is a task that can only run on a specific cpu cstat_inc(CSTAT_CPU_DSQ, tctx->cell, cctx); /* @@ -540,7 +599,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) tctx->basis_vtime = basis_vtime; - if (time_after(vtime, basis_vtime + 8192 * slice_ns)) { + if (time_after(vtime, + basis_vtime + VTIME_MAX_FUTURE_MULTIPLIER * slice_ns)) { scx_bpf_error("vtime is too far in the future for %d", p->pid); return; } @@ -548,10 +608,11 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) * Limit the amount of budget that an idling task can accumulate * to one slice. */ + // TODO: Should this be time_before64? if (time_before(vtime, basis_vtime - slice_ns)) vtime = basis_vtime - slice_ns; - scx_bpf_dsq_insert_vtime(p, tctx->dsq, slice_ns, vtime, enq_flags); + scx_bpf_dsq_insert_vtime(p, tctx->dsq.raw, slice_ns, vtime, enq_flags); /* Kick the CPU if needed */ if (!__COMPAT_is_enq_cpu_selected(enq_flags) && cpu >= 0) @@ -563,70 +624,81 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) struct cpu_ctx *cctx; u32 cell; + increment_counter(COUNTER_DISPATCH); + if (!(cctx = lookup_cpu_ctx(-1))) return; cell = READ_ONCE(cctx->cell); - bool found = false; - u64 min_vtime_dsq; - u64 min_vtime; + /* Start from a valid DSQ */ + dsq_id_t local_dsq = get_cpu_dsq_id(cpu); + bool found = false; + dsq_id_t min_vtime_dsq = local_dsq; + u64 min_vtime = ~0ULL; /* U64_MAX */ struct task_struct *p; - bpf_for_each(scx_dsq, p, cell, 0) { - min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = cell; - found = true; - break; + + // Get L3 + u32 cpu_key = (u32)cpu; + u32 *l3_ptr = bpf_map_lookup_elem(&cpu_to_l3, &cpu_key); + s32 l3 = l3_ptr ? 
(s32)*l3_ptr : L3_INVALID; + + /* Check the L3 queue */ + if (l3 != L3_INVALID) { + dsq_id_t cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); + bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) + { + min_vtime = p->scx.dsq_vtime; + min_vtime_dsq = cell_l3_dsq; + found = true; + break; + } } - u64 dsq = cpu_dsq(cpu); - bpf_for_each(scx_dsq, p, dsq, 0) { + /* Check the CPU DSQ for a lower vtime */ + bpf_for_each(scx_dsq, p, local_dsq.raw, 0) + { if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = dsq; + min_vtime_dsq = local_dsq; found = true; } break; } /* - * If we failed to find an eligible task, scx will keep running prev if - * prev->scx.flags & SCX_TASK_QUEUED (we don't set SCX_OPS_ENQ_LAST), and - * otherwise go idle. - */ - if (!found) - return; - /* - * The move_to_local can fail if we raced with some other cpu in the cell - * and now the cell is empty. We have to ensure to try the cpu_dsq or else - * we might never wakeup. - */ + * The move_to_local can fail if we raced with some other cpu in the cell + * and now the cell is empty. We have to ensure to try the cpu_dsq or else + * we might never wakeup. + */ - if (!scx_bpf_dsq_move_to_local(min_vtime_dsq) && min_vtime_dsq != dsq) - scx_bpf_dsq_move_to_local(dsq); -} + if (found) { + // We found a task in the local or cell-L3 DSQ -/* - * A couple of tricky things about checking a cgroup's cpumask: - * - * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get - * this right now is to copy the cpumask to a map entry. Given that cgroup init - * could be re-entrant we have a few per-cpu entries in a map to make this - * doable. - * - * Second, cpumask can sometimes be stored as an array in-situ or as a pointer - * and with different lengths. Some bpf_core_type_matches finagling can make - * this all work. - */ -#define MAX_CPUMASK_ENTRIES (4) + // If it was in the per cpu DSQ, there is no competation, grab it and return + if (min_vtime_dsq.raw == local_dsq.raw) { + scx_bpf_dsq_move_to_local(min_vtime_dsq.raw); + return; + } -/* - * We don't know how big struct cpumask is at compile time, so just allocate a - * large space and check that it is big enough at runtime - */ -#define CPUMASK_LONG_ENTRIES (128) -#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) + // If it was in the cell L3 DSQ, we are competing with other cpus in the cell-l3 + // try to move it to the local DSQ + if (scx_bpf_dsq_move_to_local(min_vtime_dsq.raw)) { + // We won the race and got the task, return + return; + } + } + +#if MITOSIS_ENABLE_STEALING + // We didn't find a task in either DSQ, or lost the race. + // Instead of going straight to idle, attempt to steal a task from another + // L3 in the cell. + + // Try stealing. If successful, this moves the task to the local runqueue + try_stealing_work(cell, l3); +#endif +} struct cpumask_entry { unsigned long cpumask[CPUMASK_LONG_ENTRIES]; @@ -758,18 +830,19 @@ static inline int get_cgroup_cpumask(struct cgroup *cgrp, u32 level_cells[MAX_CG_DEPTH]; int running; +/* The guard is a stack variable. When it falls out of scope, + * we drop the running lock. */ +static inline void __running_unlock(int *guard) +{ + (void)guard; /* unused */ + WRITE_ONCE(running, 0); +} + /* * On tick, we identify new cells and apply CPU assignment */ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) { - /* - * We serialize tick() on core 0 and ensure only one tick running at a time - * to ensure this can only happen once. 
- */ - if (bpf_get_smp_processor_id()) - return; - u32 local_configuration_seq = READ_ONCE(configuration_seq); if (local_configuration_seq == READ_ONCE(applied_configuration_seq)) return; @@ -779,6 +852,8 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return; + int __attribute__((cleanup(__running_unlock), unused)) __running_guard; + DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); if (!entry) return; @@ -809,8 +884,7 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) } /* - * Initialize root cell cpumask to all cpus, and then remove from it as we - * go + * Initialize root cell cpumask to all cpus, and then remove from it as we go */ bpf_cpumask_copy(root_bpf_cpumask, (const struct cpumask *)all_cpumask); @@ -842,7 +916,8 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) * Iterate over all cgroups, check if any have a cpumask and populate them * as a separate cell. */ - bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) { + bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) + { cur_cgrp = pos->cgroup; /* @@ -967,13 +1042,12 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) int cpu_idx; bpf_for(cpu_idx, 0, nr_possible_cpus) { - if (bpf_cpumask_test_cpu( - cpu_idx, (const struct cpumask *)&entry->cpumask)) { + if (bpf_cpumask_test_cpu(cpu_idx, (const struct cpumask *) + root_bpf_cpumask)) { struct cpu_ctx *cpu_ctx; if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) goto out_root_cgrp; cpu_ctx->cell = 0; - bpf_cpumask_clear_cpu(cpu_idx, root_bpf_cpumask); } } @@ -992,18 +1066,40 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) goto out_root_cgrp; } + int cell_idx; + /* Recalculate L3 counts for all active cells after CPU assignment changes */ + bpf_for(cell_idx, 1, MAX_CELLS) + { + struct cell *cell; + if (!(cell = lookup_cell(cell_idx))) { + scx_bpf_error("Lookup for cell %d failed in tick()", + cell_idx); + goto out_root_cgrp; + } + + if (!cell->in_use) + continue; + + /* Recalculate L3 counts for each active cell */ + recalc_cell_l3_counts(cell_idx); + } + + /* Recalculate root cell's L3 counts after cpumask update */ + recalc_cell_l3_counts(ROOT_CELL_ID); + barrier(); WRITE_ONCE(applied_configuration_seq, local_configuration_seq); - WRITE_ONCE(running, 0); bpf_cgroup_release(root_cgrp_ref); return; + out_rcu_unlock: bpf_rcu_read_unlock(); out_root_cgrp: bpf_cgroup_release(root_cgrp_ref); out: - bpf_cpumask_release(root_bpf_cpumask); + if (root_bpf_cpumask) + bpf_cpumask_release(root_bpf_cpumask); } void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) @@ -1016,13 +1112,47 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) !(cell = lookup_cell(cctx->cell))) return; + /* + * If this task was stolen across L3s, retag to thief L3 and recompute + * effective cpumask+DSQ. Preserve vtime to keep fairness. 
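+ * (update_task_cpumask() resets dsq_vtime to the destination queue's
+ * baseline, so the saved value is restored after the call.)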
+ */ +#if MITOSIS_ENABLE_STEALING + if (l3_is_valid(tctx->pending_l3)) { + u64 save_v = p->scx.dsq_vtime; + tctx->l3 = tctx->pending_l3; + tctx->pending_l3 = L3_INVALID; + update_task_cpumask(p, tctx); + p->scx.dsq_vtime = save_v; + } +#endif + + /* Validate task's DSQ before it starts running */ + if (tctx->dsq.raw == DSQ_INVALID) { + if (tctx->all_cell_cpus_allowed) { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CELL-SCHEDULABLE task, can run on any CPU in cell %d)", + p->pid, tctx->cell); + } else { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CORE-PINNED task, restricted to specific CPUs)", + p->pid); + } + return; + } + /* - * Update both the CPU's cell and the cpu's vtime so the vtime's are - * comparable at dispatch time. + * Update per-(cell, L3) vtime for cell-schedulable tasks */ - if (time_before(READ_ONCE(cell->vtime_now), p->scx.dsq_vtime)) - WRITE_ONCE(cell->vtime_now, p->scx.dsq_vtime); + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { + if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), + p->scx.dsq_vtime)) + WRITE_ONCE(cell->l3_vtime_now[tctx->l3], + p->scx.dsq_vtime); + } + /* + * Update CPU vtime for CPU-pinned tasks + */ if (time_before(READ_ONCE(cctx->vtime_now), p->scx.dsq_vtime)) WRITE_ONCE(cctx->vtime_now, p->scx.dsq_vtime); @@ -1048,7 +1178,7 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) used = now - tctx->started_running_at; tctx->started_running_at = now; /* scale the execution time by the inverse of the weight and charge */ - p->scx.dsq_vtime += used * 100 / p->scx.weight; + p->scx.dsq_vtime += used * DEFAULT_WEIGHT_MULTIPLIER / p->scx.weight; if (cidx != 0 || tctx->all_cell_cpus_allowed) { u64 *cell_cycles = MEMBER_VPTR(cctx->cell_cycles, [cidx]); @@ -1057,6 +1187,17 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) return; } *cell_cycles += used; + + /* + * For cell-schedulable tasks, also accumulate vtime into + * per-cell per-L3 queues + */ + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { + /* Accumulate weighted execution time into per-(cell, L3) vtime */ + cell->l3_vtime_now[tctx->l3] += + used * DEFAULT_WEIGHT_MULTIPLIER / + p->scx.weight; + } } } @@ -1083,8 +1224,9 @@ s32 BPF_STRUCT_OPS(mitosis_cgroup_init, struct cgroup *cgrp, return -ENOENT; } + // Special case for root cell if (cgrp->kn->id == root_cgid) { - WRITE_ONCE(cgc->cell, 0); + WRITE_ONCE(cgc->cell, ROOT_CELL_ID); return 0; } @@ -1175,6 +1317,7 @@ s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, { struct task_ctx *tctx; struct bpf_cpumask *cpumask; + int ret; tctx = bpf_task_storage_get(&task_ctxs, p, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); @@ -1200,16 +1343,24 @@ s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, return -EINVAL; } - return update_task_cell(p, tctx, args->cgroup); + /* Initialize L3 to invalid before cell assignment */ + init_task_l3(tctx); + + // TODO clean this up + if ((ret = update_task_cell(p, tctx, args->cgroup))) { + return ret; + } + + return 0; } __hidden void dump_cpumask_word(s32 word, const struct cpumask *cpumask) { u32 u, v = 0; - bpf_for(u, 0, 32) + bpf_for(u, 0, BITS_PER_U32) { - s32 cpu = 32 * word + u; + s32 cpu = BITS_PER_U32 * word + u; if (cpu < nr_possible_cpus && bpf_cpumask_test_cpu(cpu, cpumask)) v |= 1 << u; @@ -1239,9 +1390,82 @@ static void dump_cell_cpumask(int id) dump_cpumask(cell_cpumask); } +/* Print cell state for debugging */ +static __always_inline void dump_cell_state(u32 cell_idx) +{ + 
struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_dump("Cell %d: NOT FOUND", cell_idx); + return; + } + + scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", + cell_idx, cell->in_use, cell->cpu_cnt, + cell->l3_present_cnt); + + u32 l3; + // TODO Print vtimes for L3s + // TODO lock + bpf_for(l3, 0, nr_l3) + { + if (cell->l3_cpu_cnt[l3] > 0) { + scx_bpf_dump(" L3[%d]: %d CPUs", l3, + cell->l3_cpu_cnt[l3]); + } + } +} + +static __always_inline void dump_l3_state() +{ + u32 l3; + const struct cpumask *l3_mask; + dsq_id_t dsq_id; + + scx_bpf_dump("\n=== L3 Cache Topology ===\n"); + scx_bpf_dump("Total L3 domains: %d\n", nr_l3); + + bpf_for(l3, 0, nr_l3) + { + l3_mask = lookup_l3_cpumask(l3); + if (!l3_mask) { + scx_bpf_dump( + "L3[%d]: ERROR - failed to lookup cpumask\n", + l3); + continue; + } + + scx_bpf_dump("L3[%d] CPUS=", l3); + dump_cpumask(l3_mask); + scx_bpf_dump("\n"); + + scx_bpf_dump(" Per-cell DSQ stats:\n"); + u32 cell_idx; + bpf_for(cell_idx, 0, MAX_CELLS) + { + struct cell *cell = lookup_cell(cell_idx); + if (!cell || !cell->in_use) + continue; + + if (!l3_is_valid(l3)) + continue; + + dsq_id = get_cell_l3_dsq_id(cell_idx, l3); + u64 nr_queued = scx_bpf_dsq_nr_queued(dsq_id.raw); + + if (nr_queued > 0 || cell->l3_cpu_cnt[l3] > 0) { + scx_bpf_dump( + " Cell[%d]: %d CPUs, vtime=%llu, nr_queued=%llu\n", + cell_idx, cell->l3_cpu_cnt[l3], + READ_ONCE(cell->l3_vtime_now[l3]), + nr_queued); + } + } + } +} + void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) { - u64 dsq_id; + dsq_id_t dsq_id; int i; struct cell *cell; struct cpu_ctx *cpu_ctx; @@ -1259,9 +1483,7 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) scx_bpf_dump("CELL[%d] CPUS=", i); dump_cell_cpumask(i); scx_bpf_dump("\n"); - scx_bpf_dump("CELL[%d] vtime=%llu nr_queued=%d\n", i, - READ_ONCE(cell->vtime_now), - scx_bpf_dsq_nr_queued(i)); + dump_cell_state(i); } bpf_for(i, 0, nr_possible_cpus) @@ -1269,11 +1491,13 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) if (!(cpu_ctx = lookup_cpu_ctx(i))) return; - dsq_id = cpu_dsq(i); + dsq_id = get_cpu_dsq_id(i); scx_bpf_dump("CPU[%d] cell=%d vtime=%llu nr_queued=%d\n", i, cpu_ctx->cell, READ_ONCE(cpu_ctx->vtime_now), - scx_bpf_dsq_nr_queued(dsq_id)); + scx_bpf_dsq_nr_queued(dsq_id.raw)); } + + dump_l3_state(); } void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, @@ -1285,9 +1509,9 @@ void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, return; scx_bpf_dump( - "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%x all_cell_cpus_allowed=%d\n", + "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%llu all_cell_cpus_allowed=%d\n", p->pid, p->scx.dsq_vtime, tctx->basis_vtime, tctx->cell, - tctx->dsq, tctx->all_cell_cpus_allowed); + tctx->dsq.raw, tctx->all_cell_cpus_allowed); scx_bpf_dump("Task[%d] CPUS=", p->pid); dump_cpumask(p->cpus_ptr); scx_bpf_dump("\n"); @@ -1319,7 +1543,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { if (*u8_ptr & (1 << (i % 8))) { bpf_cpumask_set_cpu(i, cpumask); - ret = scx_bpf_create_dsq(cpu_dsq(i), -1); + ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, + ANY_NUMA); if (ret < 0) { bpf_cpumask_release(cpumask); return ret; @@ -1334,14 +1559,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if (cpumask) bpf_cpumask_release(cpumask); + /* setup cell cpumasks */ bpf_for(i, 0, MAX_CELLS) { struct cell_cpumask_wrapper *cpumaskw; - - ret = scx_bpf_create_dsq(i, -1); - if (ret < 0) - return ret; - if (!(cpumaskw = 
bpf_map_lookup_elem(&cell_cpumasks, &i))) return -ENOENT; @@ -1373,7 +1594,32 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) } } - cells[0].in_use = true; + struct cell *cell = lookup_cell(0); + if (!cell) { + scx_bpf_error("Failed to lookup cell 0"); + return -ENOENT; + } + cell->in_use = true; + + /* Configure root cell (cell 0) topology at init time using nr_l3 and l3_to_cpu masks */ + recalc_cell_l3_counts(ROOT_CELL_ID); + + /* Create (cell,L3) DSQs for all pairs. Userspace will populate maps. */ + // This is a crazy over-estimate + bpf_for(i, 0, MAX_CELLS) + { + u32 l3; + bpf_for(l3, 0, nr_l3) + { + ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, + ANY_NUMA); + if (ret < 0) + scx_bpf_error( + "Failed to create DSQ for cell %d, L3 %d: err %d", + i, l3, ret); + } + } + return 0; } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h new file mode 100644 index 0000000000..52738c6a21 --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -0,0 +1,207 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This defines the core data structures, types, and constants + * for the scx_mitosis scheduler, primarily containing `struct cell` + * and `struct task_ctx`. + */ + +#pragma once + +#ifdef LSP +#define __bpf__ +#include "../../../../include/scx/common.bpf.h" +#include "../../../../include/scx/ravg_impl.bpf.h" +#else +#include +#include +#endif + +#include "intf.h" +#include "dsq.bpf.h" + +/* + * A couple of tricky things about checking a cgroup's cpumask: + * + * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get + * this right now is to copy the cpumask to a map entry. Given that cgroup init + * could be re-entrant we have a few per-cpu entries in a map to make this + * doable. + * + * Second, cpumask can sometimes be stored as an array in-situ or as a pointer + * and with different lengths. Some bpf_core_type_matches finagling can make + * this all work. + */ +#define MAX_CPUMASK_ENTRIES (4) + +/* + * We don't know how big struct cpumask is at compile time, so just allocate a + * large space and check that it is big enough at runtime + * TODO: This should be deduplicated with the rust code and put in intf.h + */ +#define CPUMASK_LONG_ENTRIES (128) +#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) + +extern const volatile u32 nr_l3; + +extern struct cell_map cells; + +enum mitosis_constants { + + /* Root cell index */ + ROOT_CELL_ID = 0, + + /* Invalid/unset L3 value */ + // INVALID_L3_ID = -1, + + /* Default weight divisor for vtime calculation */ + DEFAULT_WEIGHT_MULTIPLIER = 100, + + /* Vtime validation multiplier (slice_ns * 8192) */ + VTIME_MAX_FUTURE_MULTIPLIER = 8192, + + /* Bits per u32 for cpumask operations */ + BITS_PER_U32 = 32, + + /* No NUMA constraint for DSQ creation */ + ANY_NUMA = -1, +}; + +static inline void copy_cell_skip_lock(struct cell *dst, const struct cell *src) +{ + /* Copy everything AFTER the lock field. + * Since lock is first and 4 bytes (verified by static assertions), + * we skip it and copy the remainder of the struct. 
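+ * With the layout checked in intf.h this copies
+ * sizeof(struct cell) - 4 == 204 bytes, starting at in_use (offset 4).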
+ */ + __builtin_memcpy(&dst->in_use, &src->in_use, + sizeof(struct cell) - sizeof(CELL_LOCK_T)); +} + +static inline struct cell *lookup_cell(int idx) +{ + struct cell *cell; + + cell = bpf_map_lookup_elem(&cells, &idx); + + if (!cell) { + scx_bpf_error("Invalid cell %d", idx); + return NULL; + } + return cell; +} + +static inline struct bpf_spin_lock *get_cell_lock(u32 cell_idx) +{ + if (cell_idx >= MAX_CELLS) { + scx_bpf_error("Invalid cell index %d", cell_idx); + return NULL; + } + + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_error("Cell %d not found", cell_idx); + return NULL; + } + return &cell->lock; +} + +/* + * task_ctx is the per-task information kept by scx_mitosis + */ +struct task_ctx { + /* cpumask is the set of valid cpus this task can schedule on */ + /* (tasks cpumask anded with its cell cpumask) */ + struct bpf_cpumask __kptr *cpumask; + /* started_running_at for recording runtime */ + u64 started_running_at; + u64 basis_vtime; + /* For the sake of monitoring, each task is owned by a cell */ + u32 cell; + /* For the sake of scheduling, a task is exclusively owned by either a cell + * or a cpu */ + dsq_id_t dsq; + /* latest configuration that was applied for this task */ + /* (to know if it has to be re-applied) */ + u32 configuration_seq; + /* Is this task allowed on all cores of its cell? */ + bool all_cell_cpus_allowed; + // Which L3 this task is assigned to + s32 l3; + +#if MITOSIS_ENABLE_STEALING + /* When a task is stolen, dispatch() marks the destination L3 here. + * running() applies the retag and recomputes cpumask (vtime preserved). + */ + s32 pending_l3; + u32 steal_count; /* how many times this task has been stolen */ + u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ + u32 steals_prevented; /* how many times this task has been prevented from being stolen */ +#endif +}; + +// These could go in mitosis.bpf.h, but we'll cross that bridge when we get +static inline const struct cpumask *lookup_cell_cpumask(int idx); + +static inline struct task_ctx *lookup_task_ctx(struct task_struct *p); + +/* MAP TYPES */ +struct function_counters_map { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, NR_COUNTERS); +}; + +struct cell_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cell); + __uint(max_entries, MAX_CELLS); +}; + +struct rcu_read_guard { + bool active; +}; + +static inline struct rcu_read_guard rcu_read_lock_guard(void) +{ + bpf_rcu_read_lock(); + return (struct rcu_read_guard){ .active = true }; +} + +static inline void rcu_read_guard_release(struct rcu_read_guard *guard) +{ + if (guard->active) { + bpf_rcu_read_unlock(); + guard->active = false; + } +} +#define RCU_READ_GUARD() \ + struct rcu_read_guard __rcu_guard \ + __attribute__((__cleanup__(rcu_read_guard_release))) = \ + rcu_read_lock_guard() + +struct cpumask_guard { + struct bpf_cpumask *mask; +}; + +static inline struct cpumask_guard cpumask_create_guard(void) +{ + struct bpf_cpumask *mask = bpf_cpumask_create(); + return (struct cpumask_guard){ .mask = mask }; +} + +static inline void cpumask_guard_release(struct cpumask_guard *guard) +{ + if (guard->mask) { + bpf_cpumask_release(guard->mask); + guard->mask = NULL; + } +} + +#define CPUMASK_GUARD(var_name) \ + struct cpumask_guard var_name \ + __attribute__((__cleanup__(cpumask_guard_release))) = \ + cpumask_create_guard() diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index 
8f42568df6..b25be74326 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ b/scheds/rust/scx_mitosis/src/main.rs @@ -6,6 +6,7 @@ mod bpf_skel; pub use bpf_skel::*; pub mod bpf_intf; mod stats; +mod mitosis_topology_utils; use std::cmp::max; use std::collections::HashMap; @@ -16,14 +17,14 @@ use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; +use std::sync::Mutex; use anyhow::bail; use anyhow::Context; use anyhow::Result; use clap::Parser; use crossbeam::channel::RecvTimeoutError; -use libbpf_rs::MapCore as _; -use libbpf_rs::OpenObject; +use libbpf_rs::{MapCore, OpenObject, MapFlags}; use log::debug; use log::info; use log::trace; @@ -46,11 +47,48 @@ use scx_utils::NR_CPUS_POSSIBLE; use stats::CellMetrics; use stats::Metrics; +use crate::mitosis_topology_utils::{populate_topology_maps, MapKind}; + +// This is the cell type from intf.h. +// When copied to user, the lock field is omitted. +// We can mmap it, or use calls to the BPF_MAP_LOOKUP_ELEM +// command of the bpf() system call with the BPF_F_LOCK flag +type BpfCell = bpf_intf::cell; const SCHEDULER_NAME: &str = "scx_mitosis"; const MAX_CELLS: usize = bpf_intf::consts_MAX_CELLS as usize; const NR_CSTATS: usize = bpf_intf::cell_stat_idx_NR_CSTATS as usize; +// Can we deduplicate this with mitosis.bpf.h? +const CPUMASK_LONG_ENTRIES: usize = 128; + +// Global debug flags +// TODO: These will be runtime adjustable via a CLI option. +static DEBUG_FLAGS: std::sync::LazyLock>> = std::sync::LazyLock::new(|| { + let mut flags = HashMap::new(); + flags.insert("cpu_to_l3".to_string(), false); + flags.insert("l3_to_cpus".to_string(), false); + flags.insert("cells".to_string(), true ); + flags.insert("counters".to_string(), true ); + flags.insert("steals".to_string(), true ); + flags.insert("metrics".to_string(), true ); + Mutex::new(flags) +}); + +/// Debug Printers +const ANSI_RED: &str = "\x1b[31m"; +const ANSI_GREEN: &str = "\x1b[32m"; +const ANSI_RESET: &str = "\x1b[0m"; + +/// Check if a debug flag is enabled +fn is_debug_flag_enabled(flag: &str) -> bool { + if let Ok(flags) = DEBUG_FLAGS.lock() { + flags.get(flag).copied().unwrap_or(false) + } else { + false + } +} + /// scx_mitosis: A dynamic affinity scheduler /// /// Cgroups are assigned to a dynamic number of Cells which are assigned to a @@ -106,20 +144,22 @@ const QUEUE_STATS_IDX: [bpf_intf::cell_stat_idx; 3] = [ // Per cell book-keeping #[derive(Debug)] -struct Cell { +struct CellMask { cpus: Cpumask, } struct Scheduler<'a> { skel: BpfSkel<'a>, monitor_interval: Duration, - cells: HashMap, + cells: HashMap, // These are the per-cell cstats. // Note these are accumulated across all CPUs. prev_cell_stats: [[u64; NR_CSTATS]; MAX_CELLS], + prev_total_steals: u64, metrics: Metrics, stats_server: StatsServer<(), Metrics>, last_configuration_seq: Option, + iteration_count: u64, } struct DistributionStats { @@ -146,7 +186,7 @@ impl Display for DistributionStats { ); write!( f, - "{:width$} {:5.1}% | Local:{:4.1}% From: CPU:{:4.1}% Cell:{:4.1}% | V:{:4.1}%", + "{:width$} {:5.1}% | Local:{:5.1}% From: CPU:{:4.1}% Cell:{:5.1}% | V:{:4.1}%", self.total_decisions, self.share_of_decisions_pct, self.local_q_pct, @@ -159,6 +199,38 @@ impl Display for DistributionStats { } impl<'a> Scheduler<'a> { + fn get_bpf_cell(&self, cell_id: u32) -> anyhow::Result> { + let key = cell_id.to_ne_bytes(); + let map = &self.skel.maps.cells; // NOTE: map is a field, not a method + + match map.lookup(&key, MapFlags::ANY)? 
{ + Some(bytes) => { + let need = core::mem::size_of::(); + if bytes.len() != need { + anyhow::bail!("cells value size {} != BpfCell {}", bytes.len(), need); + } + // Copy to an aligned buffer to avoid misaligned reference + let mut tmp = MaybeUninit::::uninit(); + unsafe { + std::ptr::copy_nonoverlapping( + bytes.as_ptr(), + tmp.as_mut_ptr() as *mut u8, + need, + ); + Ok(Some(tmp.assume_init())) + } + } + None => Ok(None), + } + } + + fn is_cell_in_use(&self, cell_id: u32) -> bool { + match self.get_bpf_cell(cell_id) { + Ok(Some(c)) => c.in_use != 0, + _ => false, + } + } + fn init(opts: &Opts, open_object: &'a mut MaybeUninit) -> Result { let topology = Topology::new()?; @@ -182,12 +254,35 @@ impl<'a> Scheduler<'a> { skel.maps.rodata_data.as_mut().unwrap().all_cpus[cpu / 8] |= 1 << (cpu % 8); } + skel.maps.rodata_data.as_mut().unwrap().nr_l3 = topology.all_llcs.len() as u32; + + // print the number of l3s we detected + info!("Found {} L3s", topology.all_llcs.len()); + match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP { 0 => info!("Kernel does not support queued wakeup optimization."), v => skel.struct_ops.mitosis_mut().flags |= v, } - let skel = scx_ops_load!(skel, mitosis, uei)?; + let mut skel = scx_ops_load!(skel, mitosis, uei)?; + + // Verify our version of the cell datastructure is the same size + // as the bpf one. + let cells_info = skel.maps.cells.info()?; + let usz = core::mem::size_of::() as u32; + if cells_info.info.value_size != usz { + bail!( + "cells value_size={} but Rust expects {} (BpfCell)", + cells_info.info.value_size, + usz + ); + } + + // Set up CPU to L3 topology mapping using the common functionality + populate_topology_maps(&mut skel, MapKind::CpuToL3, None)?; + + // Set up L3 to CPUs mapping using the common functionality + populate_topology_maps(&mut skel, MapKind::L3ToCpus, None)?; let stats_server = StatsServer::new(stats::server_data()).launch()?; @@ -196,9 +291,11 @@ impl<'a> Scheduler<'a> { monitor_interval: Duration::from_secs(opts.monitor_interval_s), cells: HashMap::new(), prev_cell_stats: [[0; NR_CSTATS]; MAX_CELLS], + prev_total_steals: 0, metrics: Metrics::default(), stats_server, last_configuration_seq: None, + iteration_count: 0, }) } @@ -210,6 +307,7 @@ impl<'a> Scheduler<'a> { let (res_ch, req_ch) = self.stats_server.channels(); while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) { + self.iteration_count += 1; self.refresh_bpf_cells()?; self.collect_metrics()?; @@ -292,7 +390,7 @@ impl<'a> Scheduler<'a> { } } - let prefix = "Total Decisions:"; + let prefix = " Total: "; // Here we want to sum the affinity violations over all cells. let scope_affn_viols: u64 = cell_stats_delta @@ -310,7 +408,10 @@ impl<'a> Scheduler<'a> { self.metrics.update(&stats); - trace!("{} {}", prefix, stats); + if is_debug_flag_enabled("metrics") { + trace!("{}{}{}", ANSI_GREEN, "metrics:", ANSI_RESET); + trace!("{} {}", prefix, stats); + } Ok(()) } @@ -327,8 +428,8 @@ impl<'a> Scheduler<'a> { .map(|&stat| cell_stats_delta[cell][stat as usize]) .sum::(); - // FIXME: This should really query if the cell is enabled or not. 
- if cell_queue_decisions == 0 { + // Only print stats for cells that are in use and have decisions + if !self.is_cell_in_use(cell as u32) { continue; } @@ -340,7 +441,7 @@ impl<'a> Scheduler<'a> { const MIN_CELL_WIDTH: usize = 2; let cell_width: usize = max(MIN_CELL_WIDTH, (MAX_CELLS as f64).log10().ceil() as usize); - let prefix = format!(" Cell {:width$}:", cell, width = cell_width); + let prefix = format!(" Cell {:width$}:", cell, width = cell_width); // Sum affinity violations for this cell let scope_affn_viols: u64 = @@ -359,7 +460,9 @@ impl<'a> Scheduler<'a> { .or_default() .update(&stats); - trace!("{} {}", prefix, stats); + if is_debug_flag_enabled("metrics") { + trace!("{} {}", prefix, stats); + } } Ok(()) } @@ -410,17 +513,70 @@ impl<'a> Scheduler<'a> { } Ok(cell_stats_delta) } + /// Print debug printer status summary + fn print_debug_status(&self) { + if let Ok(flags) = DEBUG_FLAGS.lock() { + let mut disabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| (!enabled).then_some(format!("{}~{}{}", ANSI_RED, flag, ANSI_RESET))).collect(); + let enabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| enabled.then_some(format!("{}+{}{}", ANSI_GREEN, flag, ANSI_RESET))).collect(); + disabled.extend(enabled); + trace!("Debug Flags: {}", if disabled.is_empty() { "none".to_string() } else { disabled.join(" ") }); + // trace!("hint: sudo ./scx_mitosis cli debug ~/+"); + } + } /// Collect metrics and out various debugging data like per cell stats, per-cpu stats, etc. fn collect_metrics(&mut self) -> Result<()> { + trace!(""); + trace!("Iteration #{}", self.iteration_count); + let cell_stats_delta = self.calculate_cell_stat_delta()?; self.log_all_queue_stats(&cell_stats_delta)?; + // TODO: I don't really understand this. for (cell_id, cell) in &self.cells { + // Check if cell is actually in use from BPF before printing + if !self.is_cell_in_use(*cell_id) { + continue; + } trace!("CELL[{}]: {}", cell_id, cell.cpus); } + // Read total steals from BPF and update metrics + self.update_steal_metrics()?; + + // Read and print function counters + self.print_and_reset_function_counters()?; + if is_debug_flag_enabled("cells") { + trace!("{}cells:{}", ANSI_GREEN, ANSI_RESET); + for i in 0..self.cells.len() { + if let Some(cell) = self.cells.get(&(i as u32)) { + trace!(" CELL[{}]: {} ({:3} CPUs)", i, cell.cpus, cell.cpus.weight()); + } + } + } + + if is_debug_flag_enabled("cpu_to_l3") { + let cpu_to_l3 = read_cpu_to_l3(&self.skel)?; + let cpu_l3_pairs: Vec = cpu_to_l3.iter().enumerate() + .map(|(cpu, l3)| format!("{:3}:{:2}", cpu, l3)) + .collect(); + let chunked_output = cpu_l3_pairs + .chunks(16) + .map(|chunk| chunk.join(" ")) + .collect::>() + .join("\n"); + trace!("{}cpu_to_l3:{}\n{}", ANSI_GREEN, ANSI_RESET, chunked_output); + } + + if is_debug_flag_enabled("l3_to_cpus") { + trace!("{}l3_to_cpus:{}", ANSI_GREEN, ANSI_RESET); + let l3_to_cpus = read_l3_to_cpus(&self.skel)?; + for (l3_id, mask) in l3_to_cpus.iter() { + trace!("l3_to_cpus: [{:2}] = {}", l3_id, mask); + } + } + for (cell_id, cell) in self.cells.iter() { // Assume we have a CellMetrics entry if we have a known cell self.metrics @@ -430,9 +586,167 @@ impl<'a> Scheduler<'a> { } self.metrics.num_cells = self.cells.len() as u32; + // Print debug printer status at the end of each cycle + self.print_debug_status(); + Ok(()) } + fn print_and_reset_function_counters(&mut self) -> Result<()> { + if !is_debug_flag_enabled("counters") { + return Ok(()); + } + trace!("{}counters:{}", ANSI_GREEN, ANSI_RESET); + + let counter_names = 
["select", "enqueue", "dispatch"]; + let max_name_len = counter_names.iter().map(|name| name.len()).max().unwrap_or(0); + let mut all_counters = Vec::new(); + + // Read counters for each function + for counter_idx in 0..bpf_intf::fn_counter_idx_NR_COUNTERS { + let key = (counter_idx as u32).to_ne_bytes(); + + // Read per-CPU values + let percpu_values = self.skel + .maps + .function_counters + .lookup_percpu(&key, MapFlags::ANY) + .context("Failed to lookup function counter")? + .unwrap_or_default(); + + let mut cpu_values = Vec::new(); + for cpu in 0..*NR_CPUS_POSSIBLE { + if cpu < percpu_values.len() { + let value = u64::from_ne_bytes( + percpu_values[cpu].as_slice().try_into() + .context("Failed to convert counter bytes")? + ); + cpu_values.push(value); + } + } + + all_counters.push(cpu_values); + } + + // Calculate and print statistics for each counter + for (idx, counter_values) in all_counters.iter().enumerate() { + if idx >= counter_names.len() { + break; + } + + let name = counter_names[idx]; + let non_zero_values: Vec = counter_values.iter().filter(|&&v| v > 0).copied().collect(); + + if non_zero_values.is_empty() { + trace!(" Fn[{:6} min={:>4} med={:>4} max={:>5} ({:3} CPUs)", + name, total, min, median, max, non_zero_values.len(), width = max_name_len + ); + } + + // Zero out all counters after printing + for counter_idx in 0..bpf_intf::fn_counter_idx_NR_COUNTERS { + let key = (counter_idx as u32).to_ne_bytes(); + let zero_value = 0u64.to_ne_bytes().to_vec(); + + // Create per-CPU values array (all zeros) + let percpu_values: Vec> = (0..*NR_CPUS_POSSIBLE) + .map(|_| zero_value.clone()) + .collect(); + + self.skel + .maps + .function_counters + .update_percpu(&key, &percpu_values, MapFlags::ANY) + .context("Failed to reset function counter")?; + } + + Ok(()) + } + +fn update_steal_metrics(&mut self) -> Result<()> { + let steals_debug = is_debug_flag_enabled("steals"); + + // Early out if stealing is compiled out. + if bpf_intf::MITOSIS_ENABLE_STEALING == 0 { + self.metrics.total_steals = 0; + if steals_debug { + trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET); + trace!(" Work stealing disabled at compile time (MITOSIS_ENABLE_STEALING=0)"); + } + return Ok(()); + } + + let key = 0u32.to_ne_bytes(); + + // Read the count; lazily initialize the slot to 0 if it doesn't exist. + let steal_count = match self.skel.maps.steal_stats.lookup(&key, MapFlags::ANY) { + Ok(Some(data)) if data.len() >= 8 => { + u64::from_ne_bytes(data[..8].try_into().unwrap()) + } + Ok(Some(_)) => { + if steals_debug { + debug!("steal_stats map data too small"); + } + 0 + } + Ok(None) => { + let zero = 0u64.to_ne_bytes(); + if let Err(e) = self.skel.maps.steal_stats.update(&key, &zero, MapFlags::ANY) { + if steals_debug { + debug!("Failed to initialize steal_stats map: {e}"); + } + } + 0 + } + Err(e) => { + if steals_debug { + debug!("Failed to read steal_stats map: {e}"); + } + 0 + } + }; + + // Calculate steals since last update (delta) + let steals_delta = steal_count - self.prev_total_steals; + self.prev_total_steals = steal_count; + self.metrics.total_steals = steals_delta; + + // Early out if we aren't logging. 
+        if !steals_debug {
+            return Ok(());
+        }
+
+        if steals_delta > 0 {
+            trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET);
+            trace!("  Work stealing active: steals_since_last={}", steals_delta);
+        } else {
+            trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET);
+            trace!(
+                "  Work stealing enabled but no new steals: steals_since_last={}",
+                steals_delta
+            );
+        }
+
+        Ok(())
+    }
+
     fn refresh_bpf_cells(&mut self) -> Result<()> {
         let applied_configuration = unsafe {
             std::ptr::read_volatile(
@@ -465,15 +779,12 @@ impl<'a> Scheduler<'a> {
         // Create cells we don't have yet, drop cells that are no longer in use.
         // If we continue to drop cell metrics once a cell is removed, we'll need to make sure we
         // flush metrics for a cell before we remove it completely.
-        let cells = &self.skel.maps.bss_data.as_ref().unwrap().cells;
         for i in 0..MAX_CELLS {
             let cell_idx = i as u32;
-            let bpf_cell = cells[i];
-            let in_use = unsafe { std::ptr::read_volatile(&bpf_cell.in_use as *const u32) };
-            if in_use > 0 {
+            if self.is_cell_in_use(cell_idx) {
                 self.cells
                     .entry(cell_idx)
-                    .or_insert_with(|| Cell {
+                    .or_insert_with(|| CellMask {
                         cpus: Cpumask::new(),
                     })
                     .cpus = cell_to_cpus
@@ -498,7 +809,7 @@ fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
     let cpu_ctxs_vec = skel
         .maps
         .cpu_ctxs
-        .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
+        .lookup_percpu(&0u32.to_ne_bytes(), MapFlags::ANY)
         .context("Failed to lookup cpu_ctx")?
         .unwrap();
     for cpu in 0..*NR_CPUS_POSSIBLE {
@@ -509,7 +820,52 @@ fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
     Ok(cpu_ctxs)
 }
 
+fn read_cpu_to_l3(skel: &BpfSkel) -> Result<Vec<u32>> {
+    let mut cpu_to_l3 = vec![];
+    for cpu in 0..*NR_CPUS_POSSIBLE {
+        let key = (cpu as u32).to_ne_bytes();
+        let val = skel
+            .maps
+            .cpu_to_l3
+            .lookup(&key, MapFlags::ANY)?
+            .map(|v| u32::from_ne_bytes(v.try_into().unwrap()))
+            .unwrap_or(0);
+        cpu_to_l3.push(val);
+    }
+    Ok(cpu_to_l3)
+}
+
+fn read_l3_to_cpus(skel: &BpfSkel) -> Result<Vec<(u32, Cpumask)>> {
+    let mut l3_to_cpus = vec![];
+
+    // Get the number of L3 caches from the BPF rodata.
+    let nr_l3 = skel.maps.rodata_data.as_ref().unwrap().nr_l3;
+
+    for l3 in 0..nr_l3 {
+        let key = (l3 as u32).to_ne_bytes();
+        let mask = if let Some(v) = skel
+            .maps
+            .l3_to_cpus
+            .lookup(&key, MapFlags::ANY)?
+        {
+            let bytes = v.as_slice();
+            let mut longs = [0u64; CPUMASK_LONG_ENTRIES];
+            let mut i = 0;
+            while i < CPUMASK_LONG_ENTRIES && i * 8 + 8 <= bytes.len() {
+                longs[i] = u64::from_ne_bytes(bytes[i * 8..i * 8 + 8].try_into().unwrap());
+                i += 1;
+            }
+            Cpumask::from_vec(longs.to_vec())
+        } else {
+            Cpumask::new()
+        };
+        l3_to_cpus.push((l3, mask));
+    }
+    Ok(l3_to_cpus)
+}
+
 fn main() -> Result<()> {
+
     let opts = Opts::parse();
 
     if opts.version {
diff --git a/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs b/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs
new file mode 100644
index 0000000000..de19b1e02d
--- /dev/null
+++ b/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs
@@ -0,0 +1,168 @@
+use anyhow::{bail, Context, Result};
+use libbpf_rs::{MapCore, MapFlags};
+use scx_utils::Topology;
+use std::collections::HashMap;
+use std::io::{self, BufRead, BufReader};
+use std::path::Path;
+
+use crate::bpf_skel::BpfSkel;
+
+const CPUMASK_LONG_ENTRIES: usize = 128;
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MapKind {
+    CpuToL3,
+    L3ToCpus,
+}
+
+impl std::str::FromStr for MapKind {
+    type Err = anyhow::Error;
+    fn from_str(s: &str) -> Result<Self> {
+        match s {
+            "cpu_to_l3" => Ok(MapKind::CpuToL3),
+            "l3_to_cpus" => Ok(MapKind::L3ToCpus),
+            _ => bail!("unknown map {s}"),
+        }
+    }
+}
+
+impl std::fmt::Display for MapKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            MapKind::CpuToL3 => "cpu_to_l3",
+            MapKind::L3ToCpus => "l3_to_cpus",
+        })
+    }
+}
+
+pub const SUPPORTED_MAPS: &[MapKind] = &[MapKind::CpuToL3, MapKind::L3ToCpus];
+
+/// Parse lines of the form `cpu,l3` from the provided reader.
+fn parse_cpu_l3_map<R: BufRead>(reader: R) -> Result<Vec<(usize, usize)>> {
+    let mut pairs = Vec::new();
+    for line in reader.lines() {
+        let line = line?;
+        let line = line.trim();
+        // Ignore blank lines and comments
+        if line.is_empty() || line.starts_with('#') {
+            continue;
+        }
+        let mut parts = line.split(',');
+        let cpu = parts
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("missing cpu"))?
+            .trim()
+            .parse::<usize>()?;
+        let l3 = parts
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("missing l3"))?
+            .trim()
+            .parse::<usize>()?;
+        pairs.push((cpu, l3));
+    }
+    Ok(pairs)
+}
+
+/// Read CPU/L3 pairs either from a file or from standard input.
+fn read_cpu_l3_map(path: &str) -> Result<Vec<(usize, usize)>> {
+    if path == "-" {
+        println!("reading from stdin");
+        let stdin = io::stdin();
+        let reader = BufReader::new(stdin.lock());
+        parse_cpu_l3_map(reader)
+    } else {
+        println!("reading from {path}");
+        let file = std::fs::File::open(Path::new(path))?;
+        let reader = BufReader::new(file);
+        parse_cpu_l3_map(reader)
+    }
+}
+
+/// Update map entries either from a file or from the host topology.
+/// This function can be used by both the main scheduler and CLI tools.
+pub fn populate_topology_maps(skel: &mut BpfSkel, map: MapKind, file: Option<String>) -> Result<()> {
+    match map {
+        MapKind::CpuToL3 => {
+            let map_entries = if let Some(path) = file {
+                println!("loading from {path}");
+                read_cpu_l3_map(&path)?
+            } else {
+                println!("loading from host topology");
+                let topo = Topology::new()?;
+                (0..*scx_utils::NR_CPUS_POSSIBLE)
+                    // Use 0 if a CPU is missing from the topology
+                    .map(|cpu| (cpu, topo.all_cpus.get(&cpu).map(|c| c.l3_id).unwrap_or(0)))
+                    .collect()
+            };
+            for (cpu, l3) in map_entries {
+                // Each CPU index is stored as a 32-bit key mapping to its L3 id.
+                let key = (cpu as u32).to_ne_bytes();
+                let val = (l3 as u32).to_ne_bytes();
+                skel.maps.cpu_to_l3.update(&key, &val, MapFlags::ANY)?;
+            }
+        }
+        MapKind::L3ToCpus => {
+            if file.is_some() {
+                anyhow::bail!("Loading l3_to_cpus from file is not supported yet");
+            }
+
+            println!("loading l3_to_cpus from host topology");
+            let topo = Topology::new()?;
+
+            // Group CPUs by L3 cache ID.
+            let mut l3_to_cpus: HashMap<usize, Vec<usize>> = HashMap::new();
+            for cpu in topo.all_cpus.values() {
+                l3_to_cpus.entry(cpu.l3_id).or_default().push(cpu.id);
+            }
+
+            // For each L3 cache, create a cpumask and populate the map.
+            for (l3_id, cpus) in l3_to_cpus {
+                let key = (l3_id as u32).to_ne_bytes();
+
+                // Create a cpumask structure that matches the BPF side.
+                let mut cpumask_longs = [0u64; CPUMASK_LONG_ENTRIES];
+
+                // Set a bit for each CPU in this L3 cache.
+                for cpu in cpus {
+                    let long_idx = cpu / 64;
+                    let bit_idx = cpu % 64;
+                    if long_idx < CPUMASK_LONG_ENTRIES {
+                        cpumask_longs[long_idx] |= 1u64 << bit_idx;
+                    }
+                }
+
+                // Convert to bytes for the map update.
+                let mut value_bytes = Vec::new();
+                for long_val in cpumask_longs {
+                    value_bytes.extend_from_slice(&long_val.to_ne_bytes());
+                }
+
+                skel.maps
+                    .l3_to_cpus
+                    .update(&key, &value_bytes, MapFlags::ANY)
+                    .context(format!("Failed to update l3_to_cpus map for L3 {}", l3_id))?;
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Display CPU to L3 cache relationships discovered from the host topology.
+pub fn print_topology() -> Result<()> {
+    let topo = Topology::new()?;
+    println!("Number of L3 caches: {}", topo.all_llcs.len());
+    println!("CPU -> L3 id:");
+    for cpu in topo.all_cpus.values() {
+        println!("cpu {} -> {}", cpu.id, cpu.l3_id);
+    }
+    println!("\nL3 id -> [cpus]:");
+    let mut by_l3: std::collections::BTreeMap<usize, Vec<usize>> =
+        std::collections::BTreeMap::new();
+    for cpu in topo.all_cpus.values() {
+        by_l3.entry(cpu.l3_id).or_default().push(cpu.id);
+    }
+    for (l3, mut cpus) in by_l3 {
+        cpus.sort_unstable();
+        println!("{l3} -> {:?}", cpus);
+    }
+    Ok(())
+}
diff --git a/scheds/rust/scx_mitosis/src/stats.rs b/scheds/rust/scx_mitosis/src/stats.rs
index 749296a4ff..0cfc001667 100644
--- a/scheds/rust/scx_mitosis/src/stats.rs
+++ b/scheds/rust/scx_mitosis/src/stats.rs
@@ -65,6 +65,8 @@ pub struct Metrics {
     pub share_of_decisions_pct: f64,
     #[stat(desc = "Cell scheduling decisions")]
     total_decisions: u64,
+    #[stat(desc = "Work steals since last update")]
+    pub total_steals: u64,
     #[stat(desc = "Per-cell metrics")]
     // TODO: cell names
     pub cells: BTreeMap<u32, CellMetrics>,
 }
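A minimal sketch (not part of the patch above) of how the `cpu,l3` input format accepted by `read_cpu_l3_map` could be exercised: one `cpu,l3` pair per line, with blank lines and `#` comments ignored. It assumes the `parse_cpu_l3_map` signature shown above (`Result<Vec<(usize, usize)>>`) and would live in a `#[cfg(test)]` module inside `mitosis_topology_utils.rs`.

// Hypothetical test module; assumes parse_cpu_l3_map<R: BufRead>(reader: R)
// returns Result<Vec<(usize, usize)>> as reconstructed above.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn parses_cpu_l3_pairs_and_skips_comments() {
        // Comment and blank lines are ignored; each remaining line is "cpu,l3".
        let input = "# cpu,l3\n0,0\n1,0\n\n2,1\n3,1\n";
        let pairs = parse_cpu_l3_map(Cursor::new(input)).unwrap();
        assert_eq!(pairs, vec![(0, 0), (1, 0), (2, 1), (3, 1)]);
    }
}

The same text can be fed to the CLI path by passing a file (or `-` for stdin) as the `file` argument of `populate_topology_maps(..., MapKind::CpuToL3, Some(path))`.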