From 1b11d2bb14ebfaaec8cc2792c72d36b5ba7bfa08 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Mon, 8 Sep 2025 11:27:59 -0700 Subject: [PATCH 01/12] Only print stats when cell is in_use --- scheds/rust/scx_mitosis/src/main.rs | 33 +++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index 8f42568df6..c8740c918d 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ b/scheds/rust/scx_mitosis/src/main.rs @@ -159,6 +159,13 @@ impl Display for DistributionStats { } impl<'a> Scheduler<'a> { + fn is_cell_in_use(&self, cell_id: u32) -> bool { + let cells = &self.skel.maps.bss_data.as_ref().unwrap().cells; + let bpf_cell = cells[cell_id as usize]; + let in_use = unsafe { std::ptr::read_volatile(&bpf_cell.in_use as *const u32) }; + in_use != 0 + } + fn init(opts: &Opts, open_object: &'a mut MaybeUninit) -> Result { let topology = Topology::new()?; @@ -327,8 +334,8 @@ impl<'a> Scheduler<'a> { .map(|&stat| cell_stats_delta[cell][stat as usize]) .sum::(); - // FIXME: This should really query if the cell is enabled or not. - if cell_queue_decisions == 0 { + // Only print stats for cells that are in use and have decisions + if cell_queue_decisions == 0 || !self.is_cell_in_use(cell as u32) { continue; } @@ -418,7 +425,29 @@ impl<'a> Scheduler<'a> { self.log_all_queue_stats(&cell_stats_delta)?; for (cell_id, cell) in &self.cells { + // Check if cell is actually in use from BPF before printing + if !self.is_cell_in_use(*cell_id) { + continue; + } + trace!("CELL[{}]: {}", cell_id, cell.cpus); + + // Read current CPU assignments directly from BPF for comparison + let mut bpf_cpus = Cpumask::new(); + let cpu_ctxs = read_cpu_ctxs(&self.skel)?; + for (i, cpu_ctx) in cpu_ctxs.iter().enumerate() { + if cpu_ctx.cell == *cell_id { + bpf_cpus.set_cpu(i).expect("set cpu in bpf mask"); + } + } + + trace!("CELL[{}]: BPF={}", cell_id, bpf_cpus); + + // Flag potential staleness + if cell.cpus != bpf_cpus { + warn!("STALENESS DETECTED: CELL[{}] userspace={} != bpf={}", + cell_id, cell.cpus, bpf_cpus); + } } for (cell_id, cell) in self.cells.iter() { From 0b7a1fc9de5583730845173102bdfe8ca33a9497 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Mon, 8 Sep 2025 11:39:39 -0700 Subject: [PATCH 02/12] Fix cpumask cleanup. RAII for running guard. --- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index b40c06e79d..266a8cb30b 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -758,17 +758,18 @@ static inline int get_cgroup_cpumask(struct cgroup *cgrp, u32 level_cells[MAX_CG_DEPTH]; int running; +/* The guard is a stack variable. When it falls out of scope, + * we drop the running lock. */ +static inline void __running_unlock(int *guard) { + (void)guard; /* unused */ + WRITE_ONCE(running, 0); +} + /* * On tick, we identify new cells and apply CPU assignment */ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) { - /* - * We serialize tick() on core 0 and ensure only one tick running at a time - * to ensure this can only happen once. 
- */ - if (bpf_get_smp_processor_id()) - return; u32 local_configuration_seq = READ_ONCE(configuration_seq); if (local_configuration_seq == READ_ONCE(applied_configuration_seq)) @@ -779,6 +780,8 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return; + int __attribute__((cleanup(__running_unlock), unused)) __running_guard; + DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); if (!entry) return; @@ -967,13 +970,11 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) int cpu_idx; bpf_for(cpu_idx, 0, nr_possible_cpus) { - if (bpf_cpumask_test_cpu( - cpu_idx, (const struct cpumask *)&entry->cpumask)) { + if (bpf_cpumask_test_cpu( cpu_idx, (const struct cpumask *)root_bpf_cpumask)) { struct cpu_ctx *cpu_ctx; if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) goto out_root_cgrp; cpu_ctx->cell = 0; - bpf_cpumask_clear_cpu(cpu_idx, root_bpf_cpumask); } } @@ -994,7 +995,6 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) barrier(); WRITE_ONCE(applied_configuration_seq, local_configuration_seq); - WRITE_ONCE(running, 0); bpf_cgroup_release(root_cgrp_ref); return; From 0c3c7bb828b08075f2328ac8c4de40a20911966c Mon Sep 17 00:00:00 2001 From: tommy-u Date: Mon, 8 Sep 2025 15:22:27 -0700 Subject: [PATCH 03/12] Preparing datastructures and helper functions for core scheduler modification. --- scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h | 132 +++++++++++++++ scheds/rust/scx_mitosis/src/bpf/intf.h | 17 +- .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 150 ++++++++++++++++++ scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 60 ++----- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 104 ++++++++++++ 5 files changed, 404 insertions(+), 59 deletions(-) create mode 100644 scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h create mode 100644 scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h create mode 100644 scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h new file mode 100644 index 0000000000..a545cb72ad --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -0,0 +1,132 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header defines the 32-bit dispatch queue (DSQ) ID encoding + * scheme for scx_mitosis, using type fields to distinguish between + * per-CPU and cell+L3 domain queues. It includes helper functions to + * construct, validate, and parse these DSQ IDs for queue management. + */ +#pragma once + +#include "intf.h" +#include "mitosis.bpf.h" + +/* + * ================================ + * BPF DSQ ID Layout (64 bits wide) + * ================================ + * + * Top-level format: + * [63] [62..0] + * [ B] [ ID ] + * + * If B == 1 it is a Built-in DSQ + * ------------------------- + * [63] [62] [61 .. 32] [31..0] + * [ 1] [ L] [ R ] [ V ] + * + * - L (bit 62): LOCAL_ON flag + * If L == 1 -> V = CPU number + * - R (30 bits): reserved / unused + * - V (32 bits): value (e.g., CPU#) + * + * If B == 0 -> User-defined DSQ + * ----------------------------- + * Only the low 32 bits are used. + * + * [63 .. 32] [31..0] + * [ 0s or unused ] [ VAL ] + * + * Mitosis uses VAL as follows: + * + * [31..24] [23..0] + * [QTYPE ] [DATA ] + * + * QTYPE encodes the queue type (exactly one bit set): + * + * QTYPE = 0x1 -> Per-CPU Q + * [31 .. 24] [23 .. 16] [15 .. 
0] + * [00000001] [00000000] [ CPU# ] + * [Q-TYPE:1] + * + * QTYPE = 0x2 -> Cell+L3 Q + * [31 .. 24] [23 .. 16] [15 .. 0] + * [00000010] [ CELL# ] [ L3ID ] + * [Q-TYPE:2] + * + */ + +#define DSQ_ERROR 0xFFFFFFFF; /* Error value for DSQ functions */ + +/* DSQ type enumeration */ +enum dsq_type { + DSQ_UNKNOWN, + DSQ_TYPE_CPU, + DSQ_TYPE_CELL_L3, +}; + +/* DSQ ID structure using unions for type-safe access */ +struct dsq_cpu { + u32 cpu : 16; + u32 unused : 8; + u32 type : 8; +} __attribute__((packed)); + +struct dsq_cell_l3 { + u32 l3 : 16; + u32 cell : 8; + u32 type : 8; +} __attribute__((packed)); + +union dsq_id { + u32 raw; + struct dsq_cpu cpu; + struct dsq_cell_l3 cell_l3; + struct { + u32 data : 24; + u32 type : 8; + } common; +} __attribute__((packed)); + +/* Static assertions to ensure correct sizes */ +/* Verify that all DSQ structures are exactly 32 bits */ +_Static_assert(sizeof(struct dsq_cpu) == 4, "dsq_cpu must be 32 bits"); +_Static_assert(sizeof(struct dsq_cell_l3) == 4, "dsq_cell_l3 must be 32 bits"); +_Static_assert(sizeof(union dsq_id) == 4, "dsq_id union must be 32 bits"); + +/* Inline helper functions for DSQ ID manipulation */ + +// Is this a per CPU DSQ? +static inline bool is_cpu_dsq(u32 dsq_id) +{ + union dsq_id id = { .raw = dsq_id }; + return id.common.type == DSQ_TYPE_CPU; +} + +// If this is a per cpu dsq, return the cpu +static inline u32 get_cpu_from_dsq(u32 dsq_id) +{ + union dsq_id id = { .raw = dsq_id }; + if (id.common.type != DSQ_TYPE_CPU) + return DSQ_ERROR; + return id.cpu.cpu; +} + +/* Helper functions to construct DSQ IDs */ +static inline u32 get_cpu_dsq_id(u32 cpu) +{ + if (cpu >= MAX_CPUS) + return DSQ_ERROR; + union dsq_id id = { .cpu = { .cpu = cpu, .unused = 0, .type = DSQ_TYPE_CPU } }; + return id.raw; +} + +static inline u32 get_cell_l3_dsq_id(u32 cell, u32 l3) +{ + if (cell >= MAX_CELLS || l3 >= MAX_L3S) + return DSQ_ERROR; + union dsq_id id = { .cell_l3 = {.l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; + return id.raw; +} diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 0658734545..01e1490aa5 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -18,6 +18,12 @@ typedef _Bool bool; #include #endif +/* ---- Work stealing config (compile-time) ------------------------------- */ +#ifndef MITOSIS_ENABLE_STEALING +#define MITOSIS_ENABLE_STEALING 0 +#endif +/* ----------------------------------------------------------------------- */ + enum consts { CACHELINE_SIZE = 64, MAX_CPUS_SHIFT = 9, @@ -28,6 +34,7 @@ enum consts { PCPU_BASE = 0x80000000, MAX_CG_DEPTH = 256, + }; /* Statistics */ @@ -51,14 +58,4 @@ struct cgrp_ctx { bool cell_owner; }; -/* - * cell is the per-cell book-keeping -*/ -struct cell { - // current vtime of the cell - u64 vtime_now; - // Whether or not the cell is used or not - u32 in_use; -}; - #endif /* __INTF_H */ diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h new file mode 100644 index 0000000000..0ced3fa78b --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -0,0 +1,150 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header adds L3 cache awareness to scx_mitosis by defining BPF + * maps for CPU-to-L3 domain mappings. 
It provides functions to + * recalculate per-L3 CPU counts within cells and implements weighted + * random L3 selection for tasks. It also tracks work-stealing + * statistics for cross-L3 task migrations. + */ +#pragma once + +#include "mitosis.bpf.h" +#include "intf.h" + +// It's also an option to just compute this from the cpu_to_l3 map. +struct l3_cpu_mask { + unsigned long cpumask[CPUMASK_LONG_ENTRIES]; +}; + +/* Work stealing statistics map - accessible from both BPF and userspace */ +struct steal_stats_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 1); +}; + +// A CPU -> L3 cache ID map +struct cpu_to_l3_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, MAX_CPUS); +}; + +struct l3_to_cpus_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct l3_cpu_mask); + __uint(max_entries, MAX_L3S); +}; + +extern struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); +extern struct l3_to_cpus_map l3_to_cpus SEC(".maps"); +extern struct steal_stats_map steal_stats SEC(".maps"); + +static inline const struct cpumask *lookup_l3_cpumask(u32 l3) +{ + struct l3_cpu_mask *mask; + + if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { + scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); + return NULL; + } + + return (const struct cpumask *)mask; +} + +/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes (no persistent kptrs). */ +static __always_inline void recalc_cell_l3_counts(u32 cell_idx) +{ + struct cell *cell = lookup_cell(cell_idx); + if (!cell) + return; + + struct bpf_cpumask *tmp = bpf_cpumask_create(); + if (!tmp) + return; + + u32 l3, present = 0, total_cpus = 0; + + bpf_rcu_read_lock(); + const struct cpumask *cell_mask = + lookup_cell_cpumask(cell_idx); // RCU ptr + if (!cell_mask) { + bpf_rcu_read_unlock(); + bpf_cpumask_release(tmp); + return; + } + + bpf_for(l3, 0, nr_l3) + { + const struct cpumask *l3_mask = + lookup_l3_cpumask(l3); // plain map memory + if (!l3_mask) { + cell->l3_cpu_cnt[l3] = 0; + continue; + } + + /* ok: dst is bpf_cpumask*, sources are (RCU cpumask*, plain cpumask*) */ + bpf_cpumask_and(tmp, cell_mask, l3_mask); + + u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp); + cell->l3_cpu_cnt[l3] = cnt; + total_cpus += cnt; + if (cnt) + present++; + } + bpf_rcu_read_unlock(); + + cell->l3_present_cnt = present; + cell->cpu_cnt = total_cpus; + bpf_cpumask_release(tmp); +} + +/** + * Weighted random selection of an L3 cache domain for a task. + * + * Uses the CPU count in each L3 domain within the cell as weights to + * probabilistically select an L3. L3 domains with more CPUs in the cell + * have higher probability of being selected. 
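+ *
+ * Worked example (illustrative numbers, not taken from a real topology):
+ * with l3_cpu_cnt = {4, 2, 0, 2} and cpu_cnt = 8, target is drawn uniformly
+ * from [0, 8). The cumulative scan then maps targets 0-3 to L3 0, 4-5 to
+ * L3 1 and 6-7 to L3 3, so each L3 is selected with probability equal to
+ * its share of the cell's CPUs; L3 2 contributes no CPUs and is never picked.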
+ * + * @cell_id: The cell ID to select an L3 from + * @return: L3 ID on success, INVALID_L3_ID on error, or 0 as fallback + */ +static inline s32 pick_l3_for_task(u32 cell_id) +{ + struct cell *cell; + u32 l3, target, cur = 0; + s32 ret = INVALID_L3_ID; + + /* Look up the cell structure */ + if (!(cell = lookup_cell(cell_id))) + return INVALID_L3_ID; + + /* Handle case where cell has no CPUs assigned yet */ + if (!cell->cpu_cnt) { + scx_bpf_error( + "pick_l3_for_task: cell %d has no CPUs accounted yet", + cell_id); + return INVALID_L3_ID; + } + + /* Generate random target value in range [0, cpu_cnt) */ + target = bpf_get_prandom_u32() % cell->cpu_cnt; + + /* Find the L3 domain corresponding to the target value using + * weighted selection - accumulate CPU counts until we exceed target */ + bpf_for(l3, 0, nr_l3) + { + cur += cell->l3_cpu_cnt[l3]; + if (target < cur) { + ret = (s32)l3; + break; + } + } + return ret; +} diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 266a8cb30b..2b197c87fc 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -14,14 +14,9 @@ #include "intf.h" -#ifdef LSP -#define __bpf__ -#include "../../../../include/scx/common.bpf.h" -#include "../../../../include/scx/ravg_impl.bpf.h" -#else -#include -#include -#endif +#include "mitosis.bpf.h" +#include "dsq.bpf.h" +#include "l3_aware.bpf.h" char _license[] SEC("license") = "GPL"; @@ -35,6 +30,7 @@ const volatile unsigned char all_cpus[MAX_CPUS_U8]; const volatile u64 slice_ns; const volatile u64 root_cgid = 1; +const volatile u32 nr_l3 = 1; /* * CPU assignment changes aren't fully in effect until a subsequent tick() * configuration_seq is bumped on each assignment change @@ -48,6 +44,13 @@ private(root_cgrp) struct cgroup __kptr *root_cgrp; UEI_DEFINE(uei); +/* + * Maps used for L3-aware scheduling +*/ +struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); +struct l3_to_cpus_map l3_to_cpus SEC(".maps"); +struct steal_stats_map steal_stats SEC(".maps"); + /* * We store per-cpu values along with per-cell values. Helper functions to * translate. @@ -119,27 +122,6 @@ static inline struct cgroup *task_cgroup(struct task_struct *p) return cgrp; } -/* - * task_ctx is the per-task information kept by scx_mitosis - */ -struct task_ctx { - /* cpumask is the set of valid cpus this task can schedule on */ - /* (tasks cpumask anded with its cell cpumask) */ - struct bpf_cpumask __kptr *cpumask; - /* started_running_at for recording runtime */ - u64 started_running_at; - u64 basis_vtime; - /* For the sake of monitoring, each task is owned by a cell */ - u32 cell; - /* For the sake of scheduling, a task is exclusively owned by either a cell - * or a cpu */ - u32 dsq; - /* latest configuration that was applied for this task */ - /* (to know if it has to be re-applied) */ - u32 configuration_seq; - /* Is this task allowed on all cores of its cell? */ - bool all_cell_cpus_allowed; -}; struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); @@ -607,26 +589,6 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) scx_bpf_dsq_move_to_local(dsq); } -/* - * A couple of tricky things about checking a cgroup's cpumask: - * - * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get - * this right now is to copy the cpumask to a map entry. Given that cgroup init - * could be re-entrant we have a few per-cpu entries in a map to make this - * doable. 
- * - * Second, cpumask can sometimes be stored as an array in-situ or as a pointer - * and with different lengths. Some bpf_core_type_matches finagling can make - * this all work. - */ -#define MAX_CPUMASK_ENTRIES (4) - -/* - * We don't know how big struct cpumask is at compile time, so just allocate a - * large space and check that it is big enough at runtime - */ -#define CPUMASK_LONG_ENTRIES (128) -#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) struct cpumask_entry { unsigned long cpumask[CPUMASK_LONG_ENTRIES]; diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h new file mode 100644 index 0000000000..e39bbd92d8 --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -0,0 +1,104 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This defines the core data structures, types, and constants + * for the scx_mitosis scheduler, primarily containing `struct cell` + * and `struct task_ctx`. + */ + +#pragma once + +#ifdef LSP +#define __bpf__ +#include "../../../../include/scx/common.bpf.h" +#include "../../../../include/scx/ravg_impl.bpf.h" +#else +#include +#include +#endif + +#include "intf.h" + +#define MAX_L3S 16 + +/* + * A couple of tricky things about checking a cgroup's cpumask: + * + * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get + * this right now is to copy the cpumask to a map entry. Given that cgroup init + * could be re-entrant we have a few per-cpu entries in a map to make this + * doable. + * + * Second, cpumask can sometimes be stored as an array in-situ or as a pointer + * and with different lengths. Some bpf_core_type_matches finagling can make + * this all work. + */ +#define MAX_CPUMASK_ENTRIES (4) + +extern const volatile u32 nr_l3; + +/* + * We don't know how big struct cpumask is at compile time, so just allocate a + * large space and check that it is big enough at runtime + */ +#define CPUMASK_LONG_ENTRIES (128) +#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) + +enum mitosis_constants { + /* Invalid/unset L3 value */ + INVALID_L3_ID = -1, +}; + +struct cell { + // Whether or not the cell is used or not + u32 in_use; + // Number of CPUs in this cell + u32 cpu_cnt; + // per-L3 vtimes within this cell + u64 l3_vtime_now[MAX_L3S]; + // Number of CPUs from each L3 assigned to this cell + u32 l3_cpu_cnt[MAX_L3S]; + // Number of L3s with at least one CPU in this cell + u32 l3_present_cnt; + + // TODO XXX remove this, only here temporarily to make the code compile + // current vtime of the cell + u64 vtime_now; +}; + +/* + * task_ctx is the per-task information kept by scx_mitosis + */ +struct task_ctx { + /* cpumask is the set of valid cpus this task can schedule on */ + /* (tasks cpumask anded with its cell cpumask) */ + struct bpf_cpumask __kptr *cpumask; + /* started_running_at for recording runtime */ + u64 started_running_at; + u64 basis_vtime; + /* For the sake of monitoring, each task is owned by a cell */ + u32 cell; + /* For the sake of scheduling, a task is exclusively owned by either a cell + * or a cpu */ + u32 dsq; + /* latest configuration that was applied for this task */ + /* (to know if it has to be re-applied) */ + u32 configuration_seq; + /* Is this task allowed on all cores of its cell? 
*/ + bool all_cell_cpus_allowed; + +#if MITOSIS_ENABLE_STEALING + /* When a task is stolen, dispatch() marks the destination L3 here. + * running() applies the retag and recomputes cpumask (vtime preserved). + */ + s32 pending_l3; + u32 steal_count; /* how many times this task has been stolen */ + u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ +#endif +}; + +// These could go in mitosis.bpf.h, but we'll cross that bridge when we get +static inline struct cell *lookup_cell(int idx); +static inline const struct cpumask *lookup_cell_cpumask(int idx); From ba1924b18fff93f28fec50b67e92c420a28d5301 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Mon, 8 Sep 2025 18:11:30 -0700 Subject: [PATCH 04/12] Prepare rust side for l3 awareness --- scheds/rust/scx_mitosis/src/bpf/intf.h | 8 + scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 5 + scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 13 +- scheds/rust/scx_mitosis/src/main.rs | 333 ++++++++++++++++-- .../scx_mitosis/src/mitosis_topology_utils.rs | 168 +++++++++ scheds/rust/scx_mitosis/src/stats.rs | 2 + 6 files changed, 504 insertions(+), 25 deletions(-) create mode 100644 scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 01e1490aa5..89c0096fd7 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -46,6 +46,14 @@ enum cell_stat_idx { NR_CSTATS, }; +/* Function invocation counters */ +enum counter_idx { + COUNTER_SELECT_CPU, + COUNTER_ENQUEUE, + COUNTER_DISPATCH, + NR_COUNTERS, +}; + struct cpu_ctx { u64 cstats[MAX_CELLS][NR_CSTATS]; u64 cell_cycles[MAX_CELLS]; diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 2b197c87fc..dd3f2cf240 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -49,6 +49,11 @@ UEI_DEFINE(uei); */ struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); struct l3_to_cpus_map l3_to_cpus SEC(".maps"); + +/* + * Maps for statistics +*/ +struct function_counters_map function_counters SEC(".maps"); struct steal_stats_map steal_stats SEC(".maps"); /* diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index e39bbd92d8..a4569f883e 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -37,15 +37,16 @@ */ #define MAX_CPUMASK_ENTRIES (4) -extern const volatile u32 nr_l3; - /* * We don't know how big struct cpumask is at compile time, so just allocate a * large space and check that it is big enough at runtime + * TODO: This should be deduplicated with the rust code and put in intf.h */ #define CPUMASK_LONG_ENTRIES (128) #define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) +extern const volatile u32 nr_l3; + enum mitosis_constants { /* Invalid/unset L3 value */ INVALID_L3_ID = -1, @@ -102,3 +103,11 @@ struct task_ctx { // These could go in mitosis.bpf.h, but we'll cross that bridge when we get static inline struct cell *lookup_cell(int idx); static inline const struct cpumask *lookup_cell_cpumask(int idx); + +/* MAP TYPES */ +struct function_counters_map { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, NR_COUNTERS); +}; diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index c8740c918d..c68a1476b1 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ 
b/scheds/rust/scx_mitosis/src/main.rs @@ -6,6 +6,7 @@ mod bpf_skel; pub use bpf_skel::*; pub mod bpf_intf; mod stats; +mod mitosis_topology_utils; use std::cmp::max; use std::collections::HashMap; @@ -16,14 +17,14 @@ use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; +use std::sync::Mutex; use anyhow::bail; use anyhow::Context; use anyhow::Result; use clap::Parser; use crossbeam::channel::RecvTimeoutError; -use libbpf_rs::MapCore as _; -use libbpf_rs::OpenObject; +use libbpf_rs::{MapCore, OpenObject}; use log::debug; use log::info; use log::trace; @@ -46,11 +47,42 @@ use scx_utils::NR_CPUS_POSSIBLE; use stats::CellMetrics; use stats::Metrics; +use crate::mitosis_topology_utils::{populate_topology_maps, MapKind}; const SCHEDULER_NAME: &str = "scx_mitosis"; const MAX_CELLS: usize = bpf_intf::consts_MAX_CELLS as usize; const NR_CSTATS: usize = bpf_intf::cell_stat_idx_NR_CSTATS as usize; +// Can we deduplicate this with mitosis.bpf.h? +const CPUMASK_LONG_ENTRIES: usize = 128; + +// Global debug flags +// TODO: These will be runtime adjustable via a CLI option. +static DEBUG_FLAGS: std::sync::LazyLock>> = std::sync::LazyLock::new(|| { + let mut flags = HashMap::new(); + flags.insert("cpu_to_l3".to_string(), false); + flags.insert("l3_to_cpus".to_string(), false); + flags.insert("cells".to_string(), true ); + flags.insert("counters".to_string(), true ); + flags.insert("steals".to_string(), true ); + flags.insert("metrics".to_string(), true ); + Mutex::new(flags) +}); + +/// Debug Printers +const ANSI_RED: &str = "\x1b[31m"; +const ANSI_GREEN: &str = "\x1b[32m"; +const ANSI_RESET: &str = "\x1b[0m"; + +/// Check if a debug flag is enabled +fn is_debug_flag_enabled(flag: &str) -> bool { + if let Ok(flags) = DEBUG_FLAGS.lock() { + flags.get(flag).copied().unwrap_or(false) + } else { + false + } +} + /// scx_mitosis: A dynamic affinity scheduler /// /// Cgroups are assigned to a dynamic number of Cells which are assigned to a @@ -117,9 +149,11 @@ struct Scheduler<'a> { // These are the per-cell cstats. // Note these are accumulated across all CPUs. 
prev_cell_stats: [[u64; NR_CSTATS]; MAX_CELLS], + prev_total_steals: u64, metrics: Metrics, stats_server: StatsServer<(), Metrics>, last_configuration_seq: Option, + iteration_count: u64, } struct DistributionStats { @@ -146,7 +180,7 @@ impl Display for DistributionStats { ); write!( f, - "{:width$} {:5.1}% | Local:{:4.1}% From: CPU:{:4.1}% Cell:{:4.1}% | V:{:4.1}%", + "{:width$} {:5.1}% | Local:{:5.1}% From: CPU:{:4.1}% Cell:{:5.1}% | V:{:4.1}%", self.total_decisions, self.share_of_decisions_pct, self.local_q_pct, @@ -189,12 +223,23 @@ impl<'a> Scheduler<'a> { skel.maps.rodata_data.as_mut().unwrap().all_cpus[cpu / 8] |= 1 << (cpu % 8); } + skel.maps.rodata_data.as_mut().unwrap().nr_l3 = topology.all_llcs.len() as u32; + + // print the number of l3s we detected + info!("Found {} L3s", topology.all_llcs.len()); + match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP { 0 => info!("Kernel does not support queued wakeup optimization."), v => skel.struct_ops.mitosis_mut().flags |= v, } - let skel = scx_ops_load!(skel, mitosis, uei)?; + let mut skel = scx_ops_load!(skel, mitosis, uei)?; + + // Set up CPU to L3 topology mapping using the common functionality + populate_topology_maps(&mut skel, MapKind::CpuToL3, None)?; + + // Set up L3 to CPUs mapping using the common functionality + populate_topology_maps(&mut skel, MapKind::L3ToCpus, None)?; let stats_server = StatsServer::new(stats::server_data()).launch()?; @@ -203,9 +248,11 @@ impl<'a> Scheduler<'a> { monitor_interval: Duration::from_secs(opts.monitor_interval_s), cells: HashMap::new(), prev_cell_stats: [[0; NR_CSTATS]; MAX_CELLS], + prev_total_steals: 0, metrics: Metrics::default(), stats_server, last_configuration_seq: None, + iteration_count: 0, }) } @@ -217,6 +264,7 @@ impl<'a> Scheduler<'a> { let (res_ch, req_ch) = self.stats_server.channels(); while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) { + self.iteration_count += 1; self.refresh_bpf_cells()?; self.collect_metrics()?; @@ -299,7 +347,7 @@ impl<'a> Scheduler<'a> { } } - let prefix = "Total Decisions:"; + let prefix = " Total: "; // Here we want to sum the affinity violations over all cells. 
let scope_affn_viols: u64 = cell_stats_delta @@ -317,7 +365,10 @@ impl<'a> Scheduler<'a> { self.metrics.update(&stats); - trace!("{} {}", prefix, stats); + if is_debug_flag_enabled("metrics") { + trace!("{}{}{}", ANSI_GREEN, "metrics:", ANSI_RESET); + trace!("{} {}", prefix, stats); + } Ok(()) } @@ -335,7 +386,7 @@ impl<'a> Scheduler<'a> { .sum::(); // Only print stats for cells that are in use and have decisions - if cell_queue_decisions == 0 || !self.is_cell_in_use(cell as u32) { + if !self.is_cell_in_use(cell as u32) { continue; } @@ -347,7 +398,7 @@ impl<'a> Scheduler<'a> { const MIN_CELL_WIDTH: usize = 2; let cell_width: usize = max(MIN_CELL_WIDTH, (MAX_CELLS as f64).log10().ceil() as usize); - let prefix = format!(" Cell {:width$}:", cell, width = cell_width); + let prefix = format!(" Cell {:width$}:", cell, width = cell_width); // Sum affinity violations for this cell let scope_affn_viols: u64 = @@ -366,7 +417,9 @@ impl<'a> Scheduler<'a> { .or_default() .update(&stats); - trace!("{} {}", prefix, stats); + if is_debug_flag_enabled("metrics") { + trace!("{} {}", prefix, stats); + } } Ok(()) } @@ -417,36 +470,67 @@ impl<'a> Scheduler<'a> { } Ok(cell_stats_delta) } + /// Print debug printer status summary + fn print_debug_status(&self) { + if let Ok(flags) = DEBUG_FLAGS.lock() { + let mut disabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| (!enabled).then_some(format!("{}~{}{}", ANSI_RED, flag, ANSI_RESET))).collect(); + let mut enabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| enabled.then_some(format!("{}+{}{}", ANSI_GREEN, flag, ANSI_RESET))).collect(); + disabled.extend(enabled); + trace!("Debug Flags: {}", if disabled.is_empty() { "none".to_string() } else { disabled.join(" ") }); + // trace!("hint: sudo ./scx_mitosis cli debug ~/+"); + } + } /// Collect metrics and out various debugging data like per cell stats, per-cpu stats, etc. fn collect_metrics(&mut self) -> Result<()> { + trace!(""); + trace!("Iteration #{}", self.iteration_count); + let cell_stats_delta = self.calculate_cell_stat_delta()?; self.log_all_queue_stats(&cell_stats_delta)?; + // TODO: I don't really understand this. 
for (cell_id, cell) in &self.cells { // Check if cell is actually in use from BPF before printing if !self.is_cell_in_use(*cell_id) { continue; } - trace!("CELL[{}]: {}", cell_id, cell.cpus); - - // Read current CPU assignments directly from BPF for comparison - let mut bpf_cpus = Cpumask::new(); - let cpu_ctxs = read_cpu_ctxs(&self.skel)?; - for (i, cpu_ctx) in cpu_ctxs.iter().enumerate() { - if cpu_ctx.cell == *cell_id { - bpf_cpus.set_cpu(i).expect("set cpu in bpf mask"); + } + + // Read total steals from BPF and update metrics + self.update_steal_metrics()?; + + // Read and print function counters + self.print_and_reset_function_counters()?; + if is_debug_flag_enabled("cells") { + trace!("{}cells:{}", ANSI_GREEN, ANSI_RESET); + for i in 0..self.cells.len() { + if let Some(cell) = self.cells.get(&(i as u32)) { + trace!(" CELL[{}]: {} ({:3} CPUs)", i, cell.cpus, cell.cpus.weight()); } } + } + + if is_debug_flag_enabled("cpu_to_l3") { + let cpu_to_l3 = read_cpu_to_l3(&self.skel)?; + let cpu_l3_pairs: Vec = cpu_to_l3.iter().enumerate() + .map(|(cpu, l3)| format!("{:3}:{:2}", cpu, l3)) + .collect(); + let chunked_output = cpu_l3_pairs + .chunks(16) + .map(|chunk| chunk.join(" ")) + .collect::>() + .join("\n"); + trace!("{}cpu_to_l3:{}\n{}", ANSI_GREEN, ANSI_RESET, chunked_output); + } - trace!("CELL[{}]: BPF={}", cell_id, bpf_cpus); - - // Flag potential staleness - if cell.cpus != bpf_cpus { - warn!("STALENESS DETECTED: CELL[{}] userspace={} != bpf={}", - cell_id, cell.cpus, bpf_cpus); + if is_debug_flag_enabled("l3_to_cpus") { + trace!("{}l3_to_cpus:{}", ANSI_GREEN, ANSI_RESET); + let l3_to_cpus = read_l3_to_cpus(&self.skel)?; + for (l3_id, mask) in l3_to_cpus.iter() { + trace!("l3_to_cpus: [{:2}] = {}", l3_id, mask); } } @@ -459,9 +543,167 @@ impl<'a> Scheduler<'a> { } self.metrics.num_cells = self.cells.len() as u32; + // Print debug printer status at the end of each cycle + self.print_debug_status(); + Ok(()) } + fn print_and_reset_function_counters(&mut self) -> Result<()> { + if !is_debug_flag_enabled("counters") { + return Ok(()); + } + trace!("{}counters:{}", ANSI_GREEN, ANSI_RESET); + + let counter_names = ["select", "enqueue", "dispatch"]; + let max_name_len = counter_names.iter().map(|name| name.len()).max().unwrap_or(0); + let mut all_counters = Vec::new(); + + // Read counters for each function + for counter_idx in 0..bpf_intf::counter_idx_NR_COUNTERS { + let key = (counter_idx as u32).to_ne_bytes(); + + // Read per-CPU values + let percpu_values = self.skel + .maps + .function_counters + .lookup_percpu(&key, libbpf_rs::MapFlags::ANY) + .context("Failed to lookup function counter")? + .unwrap_or_default(); + + let mut cpu_values = Vec::new(); + for cpu in 0..*NR_CPUS_POSSIBLE { + if cpu < percpu_values.len() { + let value = u64::from_ne_bytes( + percpu_values[cpu].as_slice().try_into() + .context("Failed to convert counter bytes")? 
+ ); + cpu_values.push(value); + } + } + + all_counters.push(cpu_values); + } + + // Calculate and print statistics for each counter + for (idx, counter_values) in all_counters.iter().enumerate() { + if idx >= counter_names.len() { + break; + } + + let name = counter_names[idx]; + let non_zero_values: Vec = counter_values.iter().filter(|&&v| v > 0).copied().collect(); + + if non_zero_values.is_empty() { + trace!(" Fn[{:6} min={:>4} med={:>4} max={:>5} ({:3} CPUs)", + name, total, min, median, max, non_zero_values.len(), width = max_name_len + ); + } + + // Zero out all counters after printing + for counter_idx in 0..bpf_intf::counter_idx_NR_COUNTERS { + let key = (counter_idx as u32).to_ne_bytes(); + let zero_value = 0u64.to_ne_bytes().to_vec(); + + // Create per-CPU values array (all zeros) + let percpu_values: Vec> = (0..*NR_CPUS_POSSIBLE) + .map(|_| zero_value.clone()) + .collect(); + + self.skel + .maps + .function_counters + .update_percpu(&key, &percpu_values, libbpf_rs::MapFlags::ANY) + .context("Failed to reset function counter")?; + } + + Ok(()) + } + +fn update_steal_metrics(&mut self) -> Result<()> { + let steals_debug = is_debug_flag_enabled("steals"); + + // Early out if stealing is compiled out. + if bpf_intf::MITOSIS_ENABLE_STEALING == 0 { + self.metrics.total_steals = 0; + if steals_debug { + trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET); + trace!(" Work stealing disabled at compile time (MITOSIS_ENABLE_STEALING=0)"); + } + return Ok(()); + } + + let key = 0u32.to_ne_bytes(); + + // Read the count; lazily initialize the slot to 0 if it doesn't exist. + let steal_count = match self.skel.maps.steal_stats.lookup(&key, libbpf_rs::MapFlags::ANY) { + Ok(Some(data)) if data.len() >= 8 => { + u64::from_ne_bytes(data[..8].try_into().unwrap()) + } + Ok(Some(_)) => { + if steals_debug { + debug!("steal_stats map data too small"); + } + 0 + } + Ok(None) => { + let zero = 0u64.to_ne_bytes(); + if let Err(e) = self.skel.maps.steal_stats.update(&key, &zero, libbpf_rs::MapFlags::ANY) { + if steals_debug { + debug!("Failed to initialize steal_stats map: {e}"); + } + } + 0 + } + Err(e) => { + if steals_debug { + debug!("Failed to read steal_stats map: {e}"); + } + 0 + } + }; + + // Calculate steals since last update (delta) + let steals_delta = steal_count - self.prev_total_steals; + self.prev_total_steals = steal_count; + self.metrics.total_steals = steals_delta; + + // Early out if we aren't logging. + if !steals_debug { + return Ok(()); + } + + if steals_delta > 0 { + trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET); + trace!(" Work stealing active: steals_since_last={}", steals_delta); + } else { + trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET); + trace!(" Work stealing enabled but no new steals: steals_since_last={}", steals_delta); + } + + Ok(()) +} + + fn refresh_bpf_cells(&mut self) -> Result<()> { let applied_configuration = unsafe { std::ptr::read_volatile( @@ -538,7 +780,52 @@ fn read_cpu_ctxs(skel: &BpfSkel) -> Result> { Ok(cpu_ctxs) } +fn read_cpu_to_l3(skel: &BpfSkel) -> Result> { + let mut cpu_to_l3 = vec![]; + for cpu in 0..*NR_CPUS_POSSIBLE { + let key = (cpu as u32).to_ne_bytes(); + let val = skel + .maps + .cpu_to_l3 + .lookup(&key, libbpf_rs::MapFlags::ANY)? 
+ .map(|v| u32::from_ne_bytes(v.try_into().unwrap())) + .unwrap_or(0); + cpu_to_l3.push(val); + } + Ok(cpu_to_l3) +} + +fn read_l3_to_cpus(skel: &BpfSkel) -> Result> { + let mut l3_to_cpus = vec![]; + + // Get the number of L3 caches from the BPF rodata + let nr_l3 = skel.maps.rodata_data.as_ref().unwrap().nr_l3; + + for l3 in 0..nr_l3 { + let key = (l3 as u32).to_ne_bytes(); + let mask = if let Some(v) = skel + .maps + .l3_to_cpus + .lookup(&key, libbpf_rs::MapFlags::ANY)? + { + let bytes = v.as_slice(); + let mut longs = [0u64; CPUMASK_LONG_ENTRIES]; + let mut i = 0; + while i < CPUMASK_LONG_ENTRIES && i * 8 + 8 <= bytes.len() { + longs[i] = u64::from_ne_bytes(bytes[i * 8..i * 8 + 8].try_into().unwrap()); + i += 1; + } + Cpumask::from_vec(longs.to_vec()) + } else { + Cpumask::new() + }; + l3_to_cpus.push((l3, mask)); + } + Ok(l3_to_cpus) +} + fn main() -> Result<()> { + let opts = Opts::parse(); if opts.version { diff --git a/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs b/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs new file mode 100644 index 0000000000..de19b1e02d --- /dev/null +++ b/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs @@ -0,0 +1,168 @@ +use anyhow::{bail, Context, Result}; +use libbpf_rs::{MapCore, MapFlags}; +use scx_utils::Topology; +use std::collections::HashMap; +use std::io::{self, BufRead, BufReader}; +use std::path::Path; + +use crate::bpf_skel::BpfSkel; + +const CPUMASK_LONG_ENTRIES: usize = 128; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MapKind { + CpuToL3, + L3ToCpus, +} + +impl std::str::FromStr for MapKind { + type Err = anyhow::Error; + fn from_str(s: &str) -> Result { + match s { + "cpu_to_l3" => Ok(MapKind::CpuToL3), + "l3_to_cpus" => Ok(MapKind::L3ToCpus), + _ => bail!("unknown map {s}"), + } + } +} + +impl std::fmt::Display for MapKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + MapKind::CpuToL3 => "cpu_to_l3", + MapKind::L3ToCpus => "l3_to_cpus", + }) + } +} + +pub const SUPPORTED_MAPS: &[MapKind] = &[MapKind::CpuToL3, MapKind::L3ToCpus]; + +/// Parse lines of the form `cpu,l3` from the provided reader. +fn parse_cpu_l3_map(reader: R) -> Result> { + let mut pairs = Vec::new(); + for line in reader.lines() { + let line = line?; + let line = line.trim(); + // Ignore blank lines and comments + if line.is_empty() || line.starts_with('#') { + continue; + } + let mut parts = line.split(','); + let cpu = parts + .next() + .ok_or_else(|| anyhow::anyhow!("missing cpu"))? + .trim() + .parse::()?; + let l3 = parts + .next() + .ok_or_else(|| anyhow::anyhow!("missing l3"))? + .trim() + .parse::()?; + pairs.push((cpu, l3)); + } + Ok(pairs) +} + +/// Read CPU/L3 pairs either from a file or standard input. +fn read_cpu_l3_map(path: &str) -> Result> { + if path == "-" { + println!("reading from stdin"); + let stdin = io::stdin(); + let reader = BufReader::new(stdin.lock()); + parse_cpu_l3_map(reader) + } else { + println!("reading from {path}"); + let file = std::fs::File::open(Path::new(path))?; + let reader = BufReader::new(file); + parse_cpu_l3_map(reader) + } +} + +/// Update map entries either from a file or from the host topology. +/// This function can be used by both the main scheduler and CLI tools. +pub fn populate_topology_maps(skel: &mut BpfSkel, map: MapKind, file: Option) -> Result<()> { + match map { + MapKind::CpuToL3 => { + let map_entries = if let Some(path) = file { + println!("loading from {path}"); + read_cpu_l3_map(&path)? 
+ } else { + println!("loading from host topology"); + let topo = Topology::new()?; + (0..*scx_utils::NR_CPUS_POSSIBLE) + // Use 0 if a CPU is missing from the topology + .map(|cpu| (cpu, topo.all_cpus.get(&cpu).map(|c| c.l3_id).unwrap_or(0))) + .collect() + }; + for (cpu, l3) in map_entries { + // Each CPU index is stored as a 32bit key mapping to its L3 id + let key = (cpu as u32).to_ne_bytes(); + let val = (l3 as u32).to_ne_bytes(); + skel.maps.cpu_to_l3.update(&key, &val, MapFlags::ANY)?; + } + } + MapKind::L3ToCpus => { + if file.is_some() { + anyhow::bail!("Loading l3_to_cpus from file is not supported yet"); + } + + println!("loading l3_to_cpus from host topology"); + let topo = Topology::new()?; + + // Group CPUs by L3 cache ID + let mut l3_to_cpus: HashMap> = HashMap::new(); + for cpu in topo.all_cpus.values() { + l3_to_cpus.entry(cpu.l3_id).or_default().push(cpu.id); + } + + // For each L3 cache, create a cpumask and populate the map + for (l3_id, cpus) in l3_to_cpus { + let key = (l3_id as u32).to_ne_bytes(); + + // Create a cpumask structure that matches the BPF side + let mut cpumask_longs = [0u64; CPUMASK_LONG_ENTRIES]; + + // Set bits for each CPU in this L3 cache + for cpu in cpus { + let long_idx = cpu / 64; + let bit_idx = cpu % 64; + if long_idx < CPUMASK_LONG_ENTRIES { + cpumask_longs[long_idx] |= 1u64 << bit_idx; + } + } + + // Convert to bytes for the map update + let mut value_bytes = Vec::new(); + for long_val in cpumask_longs { + value_bytes.extend_from_slice(&long_val.to_ne_bytes()); + } + + skel.maps.l3_to_cpus.update(&key, &value_bytes, MapFlags::ANY) + .context(format!("Failed to update l3_to_cpus map for L3 {}", l3_id))?; + } + } + } + Ok(()) +} + + +/// Display CPU to L3 cache relationships discovered from the host topology. 
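+///
+/// Example output on a hypothetical host with 8 CPUs split across 2 L3s
+/// (values are illustrative, not taken from a real machine):
+///
+/// ```text
+/// Number L3 caches: 2
+/// CPU -> L3 id:
+/// cpu 0 -> 0
+/// ...
+/// cpu 7 -> 1
+///
+/// L3 id -> [cpus]:
+/// 0 -> [0, 1, 2, 3]
+/// 1 -> [4, 5, 6, 7]
+/// ```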
+pub fn print_topology() -> Result<()> { + let topo = Topology::new()?; + println!("Number L3 caches: {}", topo.all_llcs.len()); + println!("CPU -> L3 id:"); + for cpu in topo.all_cpus.values() { + println!("cpu {} -> {}", cpu.id, cpu.l3_id); + } + println!("\nL3 id -> [cpus]:"); + let mut by_l3: std::collections::BTreeMap> = + std::collections::BTreeMap::new(); + for cpu in topo.all_cpus.values() { + by_l3.entry(cpu.l3_id).or_default().push(cpu.id); + } + for (l3, mut cpus) in by_l3 { + cpus.sort_unstable(); + println!("{l3} -> {:?}", cpus); + } + Ok(()) +} diff --git a/scheds/rust/scx_mitosis/src/stats.rs b/scheds/rust/scx_mitosis/src/stats.rs index 749296a4ff..0cfc001667 100644 --- a/scheds/rust/scx_mitosis/src/stats.rs +++ b/scheds/rust/scx_mitosis/src/stats.rs @@ -65,6 +65,8 @@ pub struct Metrics { pub share_of_decisions_pct: f64, #[stat(desc = "Cell scheduling decisions")] total_decisions: u64, + #[stat(desc = "Work steals since last update")] + pub total_steals: u64, #[stat(desc = "Per-cell metrics")] // TODO: cell names pub cells: BTreeMap, } From 0dd6be6d4d306b616d4927f95da4de972bc32a77 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Thu, 11 Sep 2025 16:53:15 -0700 Subject: [PATCH 05/12] scx_mitosis: add L3 awareness and work stealing --- scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h | 154 ++++--- scheds/rust/scx_mitosis/src/bpf/intf.h | 3 +- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 405 ++++++++++++++---- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 23 +- 4 files changed, 446 insertions(+), 139 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h index a545cb72ad..a6b899d2f5 100644 --- a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -3,7 +3,7 @@ * This software may be used and distributed according to the terms of the * GNU General Public License version 2. * - * This header defines the 32-bit dispatch queue (DSQ) ID encoding + * This header defines the 64-bit dispatch queue (DSQ) ID encoding * scheme for scx_mitosis, using type fields to distinguish between * per-CPU and cell+L3 domain queues. It includes helper functions to * construct, validate, and parse these DSQ IDs for queue management. @@ -37,96 +37,138 @@ * Only the low 32 bits are used. * * [63 .. 32] [31..0] - * [ 0s or unused ] [ VAL ] + * [ 0][ unused ] [ VAL ] * * Mitosis uses VAL as follows: * - * [31..24] [23..0] + * [31..28] [27..0] * [QTYPE ] [DATA ] * - * QTYPE encodes the queue type (exactly one bit set): + * QTYPE encodes the queue type: * * QTYPE = 0x1 -> Per-CPU Q - * [31 .. 24] [23 .. 16] [15 .. 0] - * [00000001] [00000000] [ CPU# ] + * [31..28] [27 .. .. 0] + * [ 0001 ] [ CPU# ] * [Q-TYPE:1] * * QTYPE = 0x2 -> Cell+L3 Q - * [31 .. 24] [23 .. 16] [15 .. 0] - * [00000010] [ CELL# ] [ L3ID ] + * [31..28] [27 .. 16] [15 .. 0] + * [ 0010 ] [ CELL# ] [ L3ID ] * [Q-TYPE:2] * */ +/* + * The use of these bitfields depends on compiler defined byte AND bit ordering. + * Make sure we're only building with Clang/LLVM and that we're little-endian. + */ +#ifndef __clang__ +#error "This code must be compiled with Clang/LLVM (eBPF: clang -target bpf)." +#endif + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +#error "dsq64 bitfield layout assumes little-endian (bpfel)." 
+#endif + +/* ---- Bitfield widths (bits) ---- */ +#define CPU_B 28 +#define L3_B 16 +#define CELL_B 12 +#define TYPE_B 4 +#define DATA_B 28 +#define RSVD_B 32 + +/* Sum checks (in bits) */ +_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); +_Static_assert(L3_B + CELL_B + TYPE_B == 32, "CELL+L3 layout low half must be 32 bits"); +_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); + +typedef union { + u64 raw; -#define DSQ_ERROR 0xFFFFFFFF; /* Error value for DSQ functions */ + /* Per-CPU user DSQ */ + struct { u64 cpu: CPU_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cpu_dsq; + + /* Cell+L3 user DSQ */ + struct { u64 l3: L3_B; u64 cell: CELL_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cell_l3_dsq; + + /* Generic user view */ + struct { u64 data: DATA_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } user_dsq; + + /* Built-in DSQ view */ + struct { u64 value:32; u64 rsvd:30; u64 local_on:1; u64 builtin:1; } builtin_dsq; + + /* NOTE: Considered packed and aligned attributes, but that's redundant */ +} dsq_id_t; + +/* + * Invalid DSQ ID Sentinel: + * invalid bc bit 63 clear (it's a user DSQ) && dsq_type == 0 (no type) + * Good for catching uninitialized DSQ IDs. +*/ +#define DSQ_INVALID ((u64) 0) + +_Static_assert(sizeof(((dsq_id_t){0}).cpu_dsq) == sizeof(u64), "cpu view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).cell_l3_dsq) == sizeof(u64), "cell+l3 view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).user_dsq) == sizeof(u64), "user common view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).builtin_dsq) == sizeof(u64), "builtin view must be 8 bytes"); + +/* Compile-time checks (in bytes) */ +_Static_assert(sizeof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8 bytes (64 bits)"); +_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8-byte aligned"); /* DSQ type enumeration */ enum dsq_type { - DSQ_UNKNOWN, + DSQ_TYPE_NONE, DSQ_TYPE_CPU, DSQ_TYPE_CELL_L3, }; -/* DSQ ID structure using unions for type-safe access */ -struct dsq_cpu { - u32 cpu : 16; - u32 unused : 8; - u32 type : 8; -} __attribute__((packed)); - -struct dsq_cell_l3 { - u32 l3 : 16; - u32 cell : 8; - u32 type : 8; -} __attribute__((packed)); - -union dsq_id { - u32 raw; - struct dsq_cpu cpu; - struct dsq_cell_l3 cell_l3; - struct { - u32 data : 24; - u32 type : 8; - } common; -} __attribute__((packed)); - -/* Static assertions to ensure correct sizes */ -/* Verify that all DSQ structures are exactly 32 bits */ -_Static_assert(sizeof(struct dsq_cpu) == 4, "dsq_cpu must be 32 bits"); -_Static_assert(sizeof(struct dsq_cell_l3) == 4, "dsq_cell_l3 must be 32 bits"); -_Static_assert(sizeof(union dsq_id) == 4, "dsq_id union must be 32 bits"); - -/* Inline helper functions for DSQ ID manipulation */ +/* Range guards */ +_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); +_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); +_Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); +_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), "DSQ_TYPE_CELL_L3 must fit in field"); + +/* + * While I considered error propagation, I decided to bail to force errors early. +*/ + +static inline bool is_user_dsq(dsq_id_t dsq_id){ + return !dsq_id.builtin_dsq.builtin && dsq_id.user_dsq.type != DSQ_TYPE_NONE; +} // Is this a per CPU DSQ? 
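+
+/*
+ * Worked encoding example (illustrative values): get_cpu_dsq_id(5) yields
+ * raw 0x0000000010000005 (cpu=5 in bits [27..0], type=DSQ_TYPE_CPU in
+ * [31..28]), and get_cell_l3_dsq_id(3, 2) yields 0x0000000020030002
+ * (l3=2 in [15..0], cell=3 in [27..16], type=DSQ_TYPE_CELL_L3 in [31..28]).
+ * Bit 63 stays clear in both, so they decode as user (non-built-in) DSQs.
+ */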
-static inline bool is_cpu_dsq(u32 dsq_id) +static inline bool is_cpu_dsq(dsq_id_t dsq_id) { - union dsq_id id = { .raw = dsq_id }; - return id.common.type == DSQ_TYPE_CPU; + return is_user_dsq(dsq_id) && dsq_id.user_dsq.type == DSQ_TYPE_CPU; } // If this is a per cpu dsq, return the cpu -static inline u32 get_cpu_from_dsq(u32 dsq_id) +static inline u32 get_cpu_from_dsq(u64 id) { - union dsq_id id = { .raw = dsq_id }; - if (id.common.type != DSQ_TYPE_CPU) - return DSQ_ERROR; - return id.cpu.cpu; + dsq_id_t dsq_id = (dsq_id_t) {.raw = id}; + if (!is_cpu_dsq(dsq_id)) + scx_bpf_error("trying to get cpu from non-cpu dsq\n"); + + return dsq_id.cpu_dsq.cpu; } /* Helper functions to construct DSQ IDs */ -static inline u32 get_cpu_dsq_id(u32 cpu) +static inline u64 get_cpu_dsq_id(u32 cpu) { + // Check for valid CPU range, 0 indexed so >=. if (cpu >= MAX_CPUS) - return DSQ_ERROR; - union dsq_id id = { .cpu = { .cpu = cpu, .unused = 0, .type = DSQ_TYPE_CPU } }; - return id.raw; + scx_bpf_error("invalid cpu %u\n", cpu); + dsq_id_t dsq_id = { .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; + + return dsq_id.raw; } -static inline u32 get_cell_l3_dsq_id(u32 cell, u32 l3) +static inline u64 get_cell_l3_dsq_id(u32 cell, u32 l3) { if (cell >= MAX_CELLS || l3 >= MAX_L3S) - return DSQ_ERROR; - union dsq_id id = { .cell_l3 = {.l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; - return id.raw; + scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); + dsq_id_t dsq_id = { .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; + + return dsq_id.raw; } diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 89c0096fd7..64c0e27e87 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -20,7 +20,7 @@ typedef _Bool bool; /* ---- Work stealing config (compile-time) ------------------------------- */ #ifndef MITOSIS_ENABLE_STEALING -#define MITOSIS_ENABLE_STEALING 0 +#define MITOSIS_ENABLE_STEALING 1 #endif /* ----------------------------------------------------------------------- */ @@ -34,7 +34,6 @@ enum consts { PCPU_BASE = 0x80000000, MAX_CG_DEPTH = 256, - }; /* Statistics */ diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index dd3f2cf240..76cacf134b 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -12,6 +12,7 @@ * cgroups belonging to the cell. */ +// TODO: fix debug printer. #include "intf.h" #include "mitosis.bpf.h" @@ -56,23 +57,13 @@ struct l3_to_cpus_map l3_to_cpus SEC(".maps"); struct function_counters_map function_counters SEC(".maps"); struct steal_stats_map steal_stats SEC(".maps"); -/* - * We store per-cpu values along with per-cell values. Helper functions to - * translate. 
- */ -static inline u32 cpu_dsq(u32 cpu) -{ - return PCPU_BASE | cpu; -} - -static inline u32 cell_dsq(u32 cell) -{ - return cell; -} +static inline void increment_counter(enum counter_idx idx) { + u64 *counter; + u32 key = idx; -static inline u32 dsq_to_cpu(u32 dsq) -{ - return dsq & ~PCPU_BASE; + counter = bpf_map_lookup_elem(&function_counters, &key); + if (counter) + (*counter)++; } static inline struct cgroup *lookup_cgrp_ancestor(struct cgroup *cgrp, @@ -127,7 +118,6 @@ static inline struct cgroup *task_cgroup(struct task_struct *p) return cgrp; } - struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); @@ -199,8 +189,12 @@ static inline int allocate_cell() if (!(c = lookup_cell(cell_idx))) return -1; - if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) + if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) { + // TODO XXX, I think we need to make this concurrent safe + __builtin_memset(c->l3_cpu_cnt, 0, sizeof(c->l3_cpu_cnt)); + c->l3_present_cnt = 0; return cell_idx; + } } scx_bpf_error("No available cells to allocate"); return -1; @@ -279,7 +273,6 @@ static inline int update_task_cpumask(struct task_struct *p, { const struct cpumask *cell_cpumask; struct cpu_ctx *cpu_ctx; - struct cell *cell; u32 cpu; if (!(cell_cpumask = lookup_cell_cpumask(tctx->cell))) @@ -288,11 +281,24 @@ static inline int update_task_cpumask(struct task_struct *p, if (!tctx->cpumask) return -EINVAL; + /* + * Calculate the intersection of CPUs that are both: + * 1. In this task's assigned cell (cell_cpumask) + * 2. Allowed by the task's CPU affinity (p->cpus_ptr) + * Store result in tctx->cpumask - this becomes the effective CPU set + * where this task can actually run. + */ bpf_cpumask_and(tctx->cpumask, cell_cpumask, p->cpus_ptr); - if (cell_cpumask) - tctx->all_cell_cpus_allowed = - bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); + /* + * Check if the task can run on ALL CPUs in its assigned cell. + * If cell_cpumask is a subset of p->cpus_ptr, it means the task's + * CPU affinity doesn't restrict it within the cell - it can use + * any CPU in the cell. This affects scheduling decisions later. + * True if all the bits in cell_cpumask are set in p->cpus_ptr. + */ + tctx->all_cell_cpus_allowed = + bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); /* * XXX - To be correct, we'd need to calculate the vtime @@ -304,16 +310,56 @@ static inline int update_task_cpumask(struct task_struct *p, * Revisit if high frequency dynamic cell switching * needs to be supported. */ + + // We want to set the task vtime to that of the cell it's joining. + // This used to be done by looking up the cell's dsq + // but now each cell has potentially multiple per l3 dsqs. 
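+	// In the cell-schedulable branch below we therefore (1) keep or pick an
+	// L3 for the task, (2) narrow its effective cpumask to affinity ∧ cell ∧
+	// L3, and (3) point tctx->dsq at the (cell, L3) DSQ and take the vtime
+	// baseline from that cell's per-L3 vtime.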
if (tctx->all_cell_cpus_allowed) { - tctx->dsq = cell_dsq(tctx->cell); - if (!(cell = lookup_cell(tctx->cell))) + + const struct cpumask *l3_mask = NULL; + if (tctx->l3 != INVALID_L3_ID) { + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + /* If the L3 no longer intersects the cell's cpumask, invalidate it */ + if (!l3_mask || !bpf_cpumask_intersects(cell_cpumask, l3_mask)) + tctx->l3 = INVALID_L3_ID; + } + + /* --- Pick a new L3 if needed --- */ + if (tctx->l3 == INVALID_L3_ID) { + s32 new_l3 = pick_l3_for_task(tctx->cell); + if (new_l3 < 0) + return -ENODEV; + tctx->l3 = new_l3; + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + if (!l3_mask) + return -ENOENT; + } + + /* --- Narrow the effective cpumask by the chosen L3 --- */ + /* tctx->cpumask already contains (task_affinity ∧ cell_mask) */ + if (tctx->cpumask) + bpf_cpumask_and(tctx->cpumask, (const struct cpumask *)tctx->cpumask, l3_mask); + + /* If empty after intersection, nothing can run here */ + if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) + return -ENODEV; + + /* --- Point to the correct (cell,L3) DSQ and set vtime baseline --- */ + tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); + + struct cell *cell = lookup_cell(tctx->cell); + if (!cell) return -ENOENT; - p->scx.dsq_vtime = READ_ONCE(cell->vtime_now); + + if (tctx->l3 < 0 || tctx->l3 >= MAX_L3S) + return -EINVAL; + p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); } else { + /* Task is CPU-restricted, use task mask */ cpu = bpf_cpumask_any_distribute(p->cpus_ptr); if (!(cpu_ctx = lookup_cpu_ctx(cpu))) return -ENOENT; - tctx->dsq = cpu_dsq(cpu); + tctx->dsq = get_cpu_dsq_id(cpu); p->scx.dsq_vtime = READ_ONCE(cpu_ctx->vtime_now); } @@ -429,20 +475,24 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, struct cpu_ctx *cctx; struct task_ctx *tctx; + increment_counter(COUNTER_SELECT_CPU); + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) return prev_cpu; if (maybe_refresh_cell(p, tctx) < 0) return prev_cpu; + /* Pinned path: only if our task really requires a per-CPU queue. 
*/ if (!tctx->all_cell_cpus_allowed) { cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); - cpu = dsq_to_cpu(tctx->dsq); + cpu = get_cpu_from_dsq(tctx->dsq); if (scx_bpf_test_and_clear_cpu_idle(cpu)) scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); return cpu; } + // Grab an idle core if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) { cstat_inc(CSTAT_LOCAL, tctx->cell, cctx); scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); @@ -476,14 +526,17 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) s32 cpu = -1; u64 basis_vtime; + increment_counter(COUNTER_ENQUEUE); + if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1))) return; if (maybe_refresh_cell(p, tctx) < 0) return; + // Cpu pinned work if (!tctx->all_cell_cpus_allowed) { - cpu = dsq_to_cpu(tctx->dsq); + cpu = get_cpu_from_dsq(tctx->dsq); } else if (!__COMPAT_is_enq_cpu_selected(enq_flags)) { /* * If we haven't selected a cpu, then we haven't looked for and kicked an @@ -507,12 +560,22 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) } if (tctx->all_cell_cpus_allowed) { + // This is a task that can run on any cpu in the cell + cstat_inc(CSTAT_CELL_DSQ, tctx->cell, cctx); - /* Task can use any CPU in its cell, so use the cell DSQ */ + + /* Task can use any CPU in its cell, set basis_vtime from per-(cell, L3) vtime */ if (!(cell = lookup_cell(tctx->cell))) return; - basis_vtime = READ_ONCE(cell->vtime_now); + + if (tctx->l3 < 0 || tctx->l3 >= MAX_L3S) { + scx_bpf_error("Invalid L3 ID for task %d in enqueue", p->pid); + return; + } + basis_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); + } else { + // This is a task that can only run on a specific cpu cstat_inc(CSTAT_CPU_DSQ, tctx->cell, cctx); /* @@ -527,7 +590,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) tctx->basis_vtime = basis_vtime; - if (time_after(vtime, basis_vtime + 8192 * slice_ns)) { + if (time_after(vtime, + basis_vtime + VTIME_MAX_FUTURE_MULTIPLIER * slice_ns)) { scx_bpf_error("vtime is too far in the future for %d", p->pid); return; } @@ -535,6 +599,7 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) * Limit the amount of budget that an idling task can accumulate * to one slice. */ + // TODO: Should this be time_before64? if (time_before(vtime, basis_vtime - slice_ns)) vtime = basis_vtime - slice_ns; @@ -550,51 +615,107 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) struct cpu_ctx *cctx; u32 cell; + increment_counter(COUNTER_DISPATCH); + if (!(cctx = lookup_cpu_ctx(-1))) return; cell = READ_ONCE(cctx->cell); - bool found = false; - u64 min_vtime_dsq; - u64 min_vtime; + /* Start from a valid DSQ */ + u64 local_dsq = get_cpu_dsq_id(cpu); + bool found = false; + u64 min_vtime_dsq = local_dsq; + u64 min_vtime = ~0ULL; /* U64_MAX */ struct task_struct *p; - bpf_for_each(scx_dsq, p, cell, 0) { - min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = cell; - found = true; - break; + + // Get L3 + u32 cpu_key = (u32)cpu; + u32 *l3_ptr = bpf_map_lookup_elem(&cpu_to_l3, &cpu_key); + s32 l3 = l3_ptr ? 
(s32)*l3_ptr : INVALID_L3_ID; + + /* Check the L3 queue */ + if (l3 != INVALID_L3_ID) { + u64 cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); + bpf_for_each(scx_dsq, p, cell_l3_dsq, 0) { + min_vtime = p->scx.dsq_vtime; + min_vtime_dsq = cell_l3_dsq; + found = true; + break; + } } - u64 dsq = cpu_dsq(cpu); - bpf_for_each(scx_dsq, p, dsq, 0) { + /* Check the CPU DSQ for a lower vtime */ + bpf_for_each(scx_dsq, p, local_dsq, 0) { if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = dsq; + min_vtime_dsq = local_dsq; found = true; } break; } /* - * If we failed to find an eligible task, scx will keep running prev if - * prev->scx.flags & SCX_TASK_QUEUED (we don't set SCX_OPS_ENQ_LAST), and - * otherwise go idle. - */ - if (!found) - return; - /* - * The move_to_local can fail if we raced with some other cpu in the cell - * and now the cell is empty. We have to ensure to try the cpu_dsq or else - * we might never wakeup. - */ - - if (!scx_bpf_dsq_move_to_local(min_vtime_dsq) && min_vtime_dsq != dsq) - scx_bpf_dsq_move_to_local(dsq); + * The move_to_local can fail if we raced with some other cpu in the cell + * and now the cell is empty. We have to ensure to try the cpu_dsq or else + * we might never wakeup. + */ + // TODO: The upstream side has "&& min_vtime_dsq != dsq" as part of this condition. + // Do we care? + if (!scx_bpf_dsq_move_to_local(min_vtime_dsq)) { +#if MITOSIS_ENABLE_STEALING + /* Dead-simple work stealing: + * If our local choices are empty, scan sibling (cell,L3) DSQs in the + * same cell and steal the head task if it can run on @cpu. + * No thresholds/cooldowns/lag heuristics—just the first eligible head. + */ + bool moved = false; + if (l3 != INVALID_L3_ID) { + // TODO: This math is kinda dumb and confusing. + u32 start = ((u32)l3 + 1) % nr_l3; + u32 off; + // TODO: This might try a bunch of L3s outside of the cell + bpf_for (off, 0, nr_l3) { + u32 cand = (start + off) % nr_l3; + if (cand == (u32)l3) + continue; + u64 src = get_cell_l3_dsq_id(cell, cand); + + struct task_struct *q; + /* Peek only at the head. */ + bpf_for_each(scx_dsq, q, src, 0) { + // TODO maybe this should use if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, q, SCX_DSQ_LOCAL, 0) == 0) + if (scx_bpf_dsq_move_to_local(src)) { + struct task_ctx *qt = lookup_task_ctx(q); + if (qt) { + qt->steal_count++; + qt->last_stolen_at = scx_bpf_now(); + /* Retag to thief L3 */ + qt->pending_l3 = l3; + } + /* Increment steal counter in map */ + u32 key = 0; + u64 *count = bpf_map_lookup_elem(&steal_stats, &key); + // NOTE: This could get expensive, but I'm not + // anticipating that many steals. Percpu if we care. 
+ if (count) + __sync_fetch_and_add(count, 1); + moved = true; + } + /* head only */ + break; + } + if (moved) + break; + } + } + if (!moved) +#endif + scx_bpf_dsq_move_to_local(local_dsq); + } } - struct cpumask_entry { unsigned long cpumask[CPUMASK_LONG_ENTRIES]; u64 used; @@ -779,8 +900,7 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) } /* - * Initialize root cell cpumask to all cpus, and then remove from it as we - * go + * Initialize root cell cpumask to all cpus, and then remove from it as we go */ bpf_cpumask_copy(root_bpf_cpumask, (const struct cpumask *)all_cpumask); @@ -960,17 +1080,38 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) goto out_root_cgrp; } + int cell_idx; + /* Recalculate L3 counts for all active cells after CPU assignment changes */ + bpf_for(cell_idx, 1, MAX_CELLS) { + struct cell *cell; + if (!(cell = lookup_cell(cell_idx))) { + scx_bpf_error("Lookup for cell %d failed in tick()", cell_idx); + goto out_root_cgrp; + } + + if (!cell->in_use) + continue; + + /* Recalculate L3 counts for each active cell */ + recalc_cell_l3_counts(cell_idx); + } + + /* Recalculate root cell's L3 counts after cpumask update */ + recalc_cell_l3_counts(ROOT_CELL_ID); + barrier(); WRITE_ONCE(applied_configuration_seq, local_configuration_seq); bpf_cgroup_release(root_cgrp_ref); return; + out_rcu_unlock: bpf_rcu_read_unlock(); out_root_cgrp: bpf_cgroup_release(root_cgrp_ref); out: - bpf_cpumask_release(root_bpf_cpumask); + if (root_bpf_cpumask) + bpf_cpumask_release(root_bpf_cpumask); } void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) @@ -984,12 +1125,44 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) return; /* - * Update both the CPU's cell and the cpu's vtime so the vtime's are - * comparable at dispatch time. + * If this task was stolen across L3s, retag to thief L3 and recompute + * effective cpumask+DSQ. Preserve vtime to keep fairness. 
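+	 * (update_task_cpumask() re-derives the DSQ and resets dsq_vtime to the
+	 * new (cell, L3) baseline, which is why the vtime is saved here and
+	 * restored after the call.)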
+ */ +#if MITOSIS_ENABLE_STEALING + if (tctx->pending_l3 >= 0 && tctx->pending_l3 < MAX_L3S) { + u64 save_v = p->scx.dsq_vtime; + tctx->l3 = tctx->pending_l3; + tctx->pending_l3 = INVALID_L3_ID; + update_task_cpumask(p, tctx); + p->scx.dsq_vtime = save_v; + } +#endif + + /* Validate task's DSQ before it starts running */ + if (tctx->dsq == DSQ_INVALID) { + if (tctx->all_cell_cpus_allowed) { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CELL-SCHEDULABLE task, can run on any CPU in cell %d)", + p->pid, tctx->cell); + } else { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CORE-PINNED task, restricted to specific CPUs)", + p->pid); + } + return; + } + + /* + * Update per-(cell, L3) vtime for cell-schedulable tasks */ - if (time_before(READ_ONCE(cell->vtime_now), p->scx.dsq_vtime)) - WRITE_ONCE(cell->vtime_now, p->scx.dsq_vtime); + if (tctx->all_cell_cpus_allowed && tctx->l3 >= 0 && tctx->l3 < MAX_L3S) { + if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), p->scx.dsq_vtime)) + WRITE_ONCE(cell->l3_vtime_now[tctx->l3], p->scx.dsq_vtime); + } + /* + * Update CPU vtime for CPU-pinned tasks + */ if (time_before(READ_ONCE(cctx->vtime_now), p->scx.dsq_vtime)) WRITE_ONCE(cctx->vtime_now, p->scx.dsq_vtime); @@ -1015,7 +1188,7 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) used = now - tctx->started_running_at; tctx->started_running_at = now; /* scale the execution time by the inverse of the weight and charge */ - p->scx.dsq_vtime += used * 100 / p->scx.weight; + p->scx.dsq_vtime += used * DEFAULT_WEIGHT_MULTIPLIER / p->scx.weight; if (cidx != 0 || tctx->all_cell_cpus_allowed) { u64 *cell_cycles = MEMBER_VPTR(cctx->cell_cycles, [cidx]); @@ -1024,6 +1197,18 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) return; } *cell_cycles += used; + + /* + * For cell-schedulable tasks, also accumulate vtime into + * per-cell per-L3 queues + */ + if (tctx->all_cell_cpus_allowed && tctx->l3 >= 0 && + tctx->l3 < MAX_L3S) { + /* Accumulate weighted execution time into per-(cell, L3) vtime */ + cell->l3_vtime_now[tctx->l3] += + used * DEFAULT_WEIGHT_MULTIPLIER / + p->scx.weight; + } } } @@ -1050,8 +1235,9 @@ s32 BPF_STRUCT_OPS(mitosis_cgroup_init, struct cgroup *cgrp, return -ENOENT; } + // Special case for root cell if (cgrp->kn->id == root_cgid) { - WRITE_ONCE(cgc->cell, 0); + WRITE_ONCE(cgc->cell, ROOT_CELL_ID); return 0; } @@ -1142,6 +1328,7 @@ s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, { struct task_ctx *tctx; struct bpf_cpumask *cpumask; + int ret; tctx = bpf_task_storage_get(&task_ctxs, p, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); @@ -1167,16 +1354,29 @@ s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, return -EINVAL; } - return update_task_cell(p, tctx, args->cgroup); + /* Initialize L3 to invalid before cell assignment */ + tctx->l3 = INVALID_L3_ID; +#if MITOSIS_ENABLE_STEALING + tctx->pending_l3 = INVALID_L3_ID; + tctx->steal_count = 0; + tctx->last_stolen_at = 0; +#endif + + // TODO clean this up + if ((ret = update_task_cell(p, tctx, args->cgroup))) { + return ret; + } + + return 0; } __hidden void dump_cpumask_word(s32 word, const struct cpumask *cpumask) { u32 u, v = 0; - bpf_for(u, 0, 32) + bpf_for(u, 0, BITS_PER_U32) { - s32 cpu = 32 * word + u; + s32 cpu = BITS_PER_U32 * word + u; if (cpu < nr_possible_cpus && bpf_cpumask_test_cpu(cpu, cpumask)) v |= 1 << u; @@ -1206,6 +1406,31 @@ static void dump_cell_cpumask(int id) dump_cpumask(cell_cpumask); } +/* Print cell state for 
debugging */ +static __always_inline void dump_cell_state(u32 cell_idx) +{ + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_dump("Cell %d: NOT FOUND", cell_idx); + return; + } + + scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", + cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); + + u32 l3; + // Print vtimes for L3s + bpf_for(l3, 0, nr_l3) { + if (cell->l3_cpu_cnt[l3] > 0) { + scx_bpf_dump(" L3[%d]: %d CPUs", l3, cell->l3_cpu_cnt[l3]); + } + } +} + +// TODO: FIX THIS +static __always_inline void dump_l3_state(){ +} + void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) { u64 dsq_id; @@ -1226,9 +1451,7 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) scx_bpf_dump("CELL[%d] CPUS=", i); dump_cell_cpumask(i); scx_bpf_dump("\n"); - scx_bpf_dump("CELL[%d] vtime=%llu nr_queued=%d\n", i, - READ_ONCE(cell->vtime_now), - scx_bpf_dsq_nr_queued(i)); + dump_cell_state(i); } bpf_for(i, 0, nr_possible_cpus) @@ -1236,11 +1459,14 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) if (!(cpu_ctx = lookup_cpu_ctx(i))) return; - dsq_id = cpu_dsq(i); + dsq_id = get_cpu_dsq_id(i); scx_bpf_dump("CPU[%d] cell=%d vtime=%llu nr_queued=%d\n", i, cpu_ctx->cell, READ_ONCE(cpu_ctx->vtime_now), scx_bpf_dsq_nr_queued(dsq_id)); } + + dump_l3_state(); + } void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, @@ -1252,7 +1478,7 @@ void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, return; scx_bpf_dump( - "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%x all_cell_cpus_allowed=%d\n", + "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%llu all_cell_cpus_allowed=%d\n", p->pid, p->scx.dsq_vtime, tctx->basis_vtime, tctx->cell, tctx->dsq, tctx->all_cell_cpus_allowed); scx_bpf_dump("Task[%d] CPUS=", p->pid); @@ -1286,7 +1512,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { if (*u8_ptr & (1 << (i % 8))) { bpf_cpumask_set_cpu(i, cpumask); - ret = scx_bpf_create_dsq(cpu_dsq(i), -1); + ret = scx_bpf_create_dsq(get_cpu_dsq_id(i), ANY_NUMA); if (ret < 0) { bpf_cpumask_release(cpumask); return ret; @@ -1301,14 +1527,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if (cpumask) bpf_cpumask_release(cpumask); + /* setup cell cpumasks */ bpf_for(i, 0, MAX_CELLS) { struct cell_cpumask_wrapper *cpumaskw; - - ret = scx_bpf_create_dsq(i, -1); - if (ret < 0) - return ret; - if (!(cpumaskw = bpf_map_lookup_elem(&cell_cpumasks, &i))) return -ENOENT; @@ -1341,11 +1563,34 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) } cells[0].in_use = true; + + /* Configure root cell (cell 0) topology at init time using nr_l3 and l3_to_cpu masks */ + recalc_cell_l3_counts(ROOT_CELL_ID); + + /* Create (cell,L3) DSQs for all pairs. Userspace will populate maps. 
*/ + // This is a crazy over-estimate + bpf_for(i, 0, MAX_CELLS) + { + u32 l3; + bpf_for(l3, 0, nr_l3) + { + u64 id = get_cell_l3_dsq_id(i, l3); + ret = scx_bpf_create_dsq(id, ANY_NUMA); + if (ret < 0) + scx_bpf_error( "Failed to create DSQ for cell %d, L3 %d: err %d", i, l3, ret); + } + } + return 0; } void BPF_STRUCT_OPS(mitosis_exit, struct scx_exit_info *ei) { + // int i; + // bpf_for(i, 0, MAX_CELLS); { + // dump_cell_state((u32)i); + // } + UEI_RECORD(uei, ei); } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index a4569f883e..4eb3b2231f 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -23,6 +23,9 @@ #define MAX_L3S 16 +#include "dsq.bpf.h" + + /* * A couple of tricky things about checking a cgroup's cpumask: * @@ -48,8 +51,24 @@ extern const volatile u32 nr_l3; enum mitosis_constants { + + /* Root cell index */ + ROOT_CELL_ID = 0, + /* Invalid/unset L3 value */ INVALID_L3_ID = -1, + + /* Default weight divisor for vtime calculation */ + DEFAULT_WEIGHT_MULTIPLIER = 100, + + /* Vtime validation multiplier (slice_ns * 8192) */ + VTIME_MAX_FUTURE_MULTIPLIER = 8192, + + /* Bits per u32 for cpumask operations */ + BITS_PER_U32 = 32, + + /* No NUMA constraint for DSQ creation */ + ANY_NUMA = -1, }; struct cell { @@ -83,12 +102,14 @@ struct task_ctx { u32 cell; /* For the sake of scheduling, a task is exclusively owned by either a cell * or a cpu */ - u32 dsq; + u64 dsq; /* latest configuration that was applied for this task */ /* (to know if it has to be re-applied) */ u32 configuration_seq; /* Is this task allowed on all cores of its cell? */ bool all_cell_cpus_allowed; + // Which L3 this task is assigned to + s32 l3; #if MITOSIS_ENABLE_STEALING /* When a task is stolen, dispatch() marks the destination L3 here. From 8523b9d1c28bc04abf7e9f64ff75c47d56314c83 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Mon, 22 Sep 2025 15:09:55 -0700 Subject: [PATCH 06/12] scx_mitosis: major work stealing cleanup --- scheds/rust/scx_mitosis/src/bpf/intf.h | 2 +- .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 156 ++++++++++++++++-- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 111 +++++-------- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 46 +++++- scheds/rust/scx_mitosis/src/main.rs | 4 +- 5 files changed, 225 insertions(+), 94 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 64c0e27e87..8957d7165c 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -46,7 +46,7 @@ enum cell_stat_idx { }; /* Function invocation counters */ -enum counter_idx { +enum fn_counter_idx { COUNTER_SELECT_CPU, COUNTER_ENQUEUE, COUNTER_DISPATCH, diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h index 0ced3fa78b..80ab1cc26b 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -3,8 +3,8 @@ * This software may be used and distributed according to the terms of the * GNU General Public License version 2. * - * This header adds L3 cache awareness to scx_mitosis by defining BPF - * maps for CPU-to-L3 domain mappings. It provides functions to + * This header assists adding L3 cache awareness to scx_mitosis by defining + * maps and fns for managing CPU-to-L3 domain mappings. 
It provides code to * recalculate per-L3 CPU counts within cells and implements weighted * random L3 selection for tasks. It also tracks work-stealing * statistics for cross-L3 task migrations. @@ -14,10 +14,15 @@ #include "mitosis.bpf.h" #include "intf.h" -// It's also an option to just compute this from the cpu_to_l3 map. -struct l3_cpu_mask { - unsigned long cpumask[CPUMASK_LONG_ENTRIES]; -}; +typedef u32 l3_id_t; +#define L3_INVALID ((l3_id_t) ~0u) + +// Configure how aggressively we steal work. +// When task is detected as a steal candidate, skip it this many times +// On a web server workload, 100 reduced steal count by ~90% +#ifdef MITOSIS_ENABLE_STEALING +#define PREVENT_N_STEALS 0 +#endif /* Work stealing statistics map - accessible from both BPF and userspace */ struct steal_stats_map { @@ -38,27 +43,46 @@ struct cpu_to_l3_map { struct l3_to_cpus_map { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, u32); - __type(value, struct l3_cpu_mask); + __type(value, struct cpumask); __uint(max_entries, MAX_L3S); }; -extern struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); -extern struct l3_to_cpus_map l3_to_cpus SEC(".maps"); -extern struct steal_stats_map steal_stats SEC(".maps"); +extern struct cpu_to_l3_map cpu_to_l3; +extern struct l3_to_cpus_map l3_to_cpus; +extern struct steal_stats_map steal_stats; + +static inline const bool l3_is_valid(u32 l3_id) { + if (l3_id == L3_INVALID) + return false; + + return (l3_id >= 0) && (l3_id < MAX_L3S); +} + +static inline void init_task_l3(struct task_ctx *tctx) { + tctx->l3 = L3_INVALID; + +#if MITOSIS_ENABLE_STEALING + tctx->pending_l3 = L3_INVALID; + tctx->steal_count = 0; + tctx->last_stolen_at = 0; + tctx->steals_prevented = 0; +#endif + +} static inline const struct cpumask *lookup_l3_cpumask(u32 l3) { - struct l3_cpu_mask *mask; + struct cpumask *mask; if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); return NULL; } - return (const struct cpumask *)mask; + return mask; } -/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes (no persistent kptrs). */ +/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes */ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) { struct cell *cell = lookup_cell(cell_idx); @@ -89,7 +113,6 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) continue; } - /* ok: dst is bpf_cpumask*, sources are (RCU cpumask*, plain cpumask*) */ bpf_cpumask_and(tmp, cell_mask, l3_mask); u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp); @@ -113,24 +136,24 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) * have higher probability of being selected. 
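 *
 * Illustrative example (counts assumed, not taken from the code): with
 * l3_cpu_cnt = {4, 2, 2} and cpu_cnt = 8, a random target of 5 walks the
 * running sum (4, then 6 > 5) and picks the second L3, so each L3 is
 * selected with probability l3_cpu_cnt[l3] / cpu_cnt.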
* * @cell_id: The cell ID to select an L3 from - * @return: L3 ID on success, INVALID_L3_ID on error, or 0 as fallback + * @return: L3 ID on success, L3_INVALID on error */ static inline s32 pick_l3_for_task(u32 cell_id) { struct cell *cell; u32 l3, target, cur = 0; - s32 ret = INVALID_L3_ID; + s32 ret = L3_INVALID; /* Look up the cell structure */ if (!(cell = lookup_cell(cell_id))) - return INVALID_L3_ID; + return L3_INVALID; /* Handle case where cell has no CPUs assigned yet */ if (!cell->cpu_cnt) { scx_bpf_error( "pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); - return INVALID_L3_ID; + return L3_INVALID; } /* Generate random target value in range [0, cpu_cnt) */ @@ -148,3 +171,100 @@ static inline s32 pick_l3_for_task(u32 cell_id) } return ret; } + +#ifdef MITOSIS_ENABLE_STEALING + +static inline bool try_stealing_this_task(struct task_ctx *task_ctx, + s32 local_l3, u64 candidate_dsq) +{ + // Attempt the steal, can fail beacuse it's a race. + if (!scx_bpf_dsq_move_to_local(candidate_dsq)) + return false; + + // We got the task! + task_ctx->steal_count++; + task_ctx->last_stolen_at = scx_bpf_now(); + /* Retag to thief L3 (the one for this cpu) */ + task_ctx->pending_l3 = local_l3; + task_ctx->steals_prevented = 0; + + /* Increment steal counter in map */ + u32 key = 0; + u64 *count = bpf_map_lookup_elem(&steal_stats, &key); + // NOTE: This could get expensive, but I'm not anticipating that many steals. Percpu if we care. + if (count) + __sync_fetch_and_add(count, 1); + + return true; +} + +/* Work stealing: + * Scan sibling (cell,L3) DSQs in the same cell and steal the first queued task if it can run on this cpu +*/ +static inline bool try_stealing_work(u32 cell, s32 local_l3) +{ + if (!l3_is_valid(local_l3)) + scx_bpf_error("try_stealing_work: invalid local_l3"); + + struct cell *cell_ptr = lookup_cell(cell); + if (!cell_ptr) + scx_bpf_error("try_stealing_work: invalid cell"); + + // Loop over all other L3s, looking for a queued task to steal + u32 i; + bpf_for(i, 1, nr_l3) + { + // Start with the next one to spread out the load + u32 candidate_l3 = (local_l3 + i) % nr_l3; + + // Prevents the optimizer from removing the following conditional return + // so that the verifier knows the read wil be safe + barrier_var(candidate_l3); + + if (candidate_l3 >= MAX_L3S) + continue; + + // Skip L3s that are not present in this cell + // Note: rechecking cell_ptr for verifier + if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) + continue; + + u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3); + + struct task_struct *task = NULL; + struct task_ctx *task_ctx; + // I'm only using this for the verifier + bool found_task = false; + + // Optimization: skip if faster than constructing an iterator + // Not redundant with later checking if task found (race) + if (scx_bpf_dsq_nr_queued(candidate_dsq)) + continue; + + // Just a trick for peeking the head element + bpf_for_each(scx_dsq, task, candidate_dsq, 0) + { + task_ctx = lookup_task_ctx(task); + found_task = (task_ctx != NULL); + break; + } + + // No task? Try next L3 + if (!found_task) + continue; + + // This knob throttles stealing. + // TODO: make runtime configurable + if (task_ctx->steals_prevented++ < PREVENT_N_STEALS) { + continue; + } + + if (!try_stealing_this_task(task_ctx, local_l3, candidate_dsq)) + continue; + + // Success, we got a task (no guarantee it was the one we peeked though... 
race) + return true; + } + return false; +} +#endif diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 76cacf134b..363a013935 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -57,7 +57,7 @@ struct l3_to_cpus_map l3_to_cpus SEC(".maps"); struct function_counters_map function_counters SEC(".maps"); struct steal_stats_map steal_stats SEC(".maps"); -static inline void increment_counter(enum counter_idx idx) { +static inline void increment_counter(enum fn_counter_idx idx) { u64 *counter; u32 key = idx; @@ -312,20 +312,18 @@ static inline int update_task_cpumask(struct task_struct *p, */ // We want to set the task vtime to that of the cell it's joining. - // This used to be done by looking up the cell's dsq - // but now each cell has potentially multiple per l3 dsqs. if (tctx->all_cell_cpus_allowed) { const struct cpumask *l3_mask = NULL; - if (tctx->l3 != INVALID_L3_ID) { + if (tctx->l3 != L3_INVALID) { l3_mask = lookup_l3_cpumask((u32)tctx->l3); /* If the L3 no longer intersects the cell's cpumask, invalidate it */ if (!l3_mask || !bpf_cpumask_intersects(cell_cpumask, l3_mask)) - tctx->l3 = INVALID_L3_ID; + tctx->l3 = L3_INVALID; } /* --- Pick a new L3 if needed --- */ - if (tctx->l3 == INVALID_L3_ID) { + if (tctx->l3 == L3_INVALID) { s32 new_l3 = pick_l3_for_task(tctx->cell); if (new_l3 < 0) return -ENODEV; @@ -351,8 +349,9 @@ static inline int update_task_cpumask(struct task_struct *p, if (!cell) return -ENOENT; - if (tctx->l3 < 0 || tctx->l3 >= MAX_L3S) + if (!l3_is_valid(tctx->l3)) return -EINVAL; + p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); } else { /* Task is CPU-restricted, use task mask */ @@ -568,7 +567,7 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) if (!(cell = lookup_cell(tctx->cell))) return; - if (tctx->l3 < 0 || tctx->l3 >= MAX_L3S) { + if (!l3_is_valid(tctx->l3)) { scx_bpf_error("Invalid L3 ID for task %d in enqueue", p->pid); return; } @@ -633,10 +632,10 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) // Get L3 u32 cpu_key = (u32)cpu; u32 *l3_ptr = bpf_map_lookup_elem(&cpu_to_l3, &cpu_key); - s32 l3 = l3_ptr ? (s32)*l3_ptr : INVALID_L3_ID; + s32 l3 = l3_ptr ? (s32)*l3_ptr : L3_INVALID; /* Check the L3 queue */ - if (l3 != INVALID_L3_ID) { + if (l3 != L3_INVALID) { u64 cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); bpf_for_each(scx_dsq, p, cell_l3_dsq, 0) { min_vtime = p->scx.dsq_vtime; @@ -661,59 +660,33 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) * and now the cell is empty. We have to ensure to try the cpu_dsq or else * we might never wakeup. */ - // TODO: The upstream side has "&& min_vtime_dsq != dsq" as part of this condition. - // Do we care? - if (!scx_bpf_dsq_move_to_local(min_vtime_dsq)) { -#if MITOSIS_ENABLE_STEALING - /* Dead-simple work stealing: - * If our local choices are empty, scan sibling (cell,L3) DSQs in the - * same cell and steal the head task if it can run on @cpu. - * No thresholds/cooldowns/lag heuristics—just the first eligible head. - */ - bool moved = false; - if (l3 != INVALID_L3_ID) { - // TODO: This math is kinda dumb and confusing. 
- u32 start = ((u32)l3 + 1) % nr_l3; - u32 off; - // TODO: This might try a bunch of L3s outside of the cell - bpf_for (off, 0, nr_l3) { - u32 cand = (start + off) % nr_l3; - if (cand == (u32)l3) - continue; - u64 src = get_cell_l3_dsq_id(cell, cand); - - struct task_struct *q; - /* Peek only at the head. */ - bpf_for_each(scx_dsq, q, src, 0) { - // TODO maybe this should use if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, q, SCX_DSQ_LOCAL, 0) == 0) - if (scx_bpf_dsq_move_to_local(src)) { - struct task_ctx *qt = lookup_task_ctx(q); - if (qt) { - qt->steal_count++; - qt->last_stolen_at = scx_bpf_now(); - /* Retag to thief L3 */ - qt->pending_l3 = l3; - } - /* Increment steal counter in map */ - u32 key = 0; - u64 *count = bpf_map_lookup_elem(&steal_stats, &key); - // NOTE: This could get expensive, but I'm not - // anticipating that many steals. Percpu if we care. - if (count) - __sync_fetch_and_add(count, 1); - moved = true; - } - /* head only */ - break; - } - if (moved) - break; - } + + + if (found) { + // We found a task in the local or cell-L3 DSQ + + // If it was in the per cpu DSQ, there is no competation, grab it and return + if (min_vtime_dsq == local_dsq) { + scx_bpf_dsq_move_to_local(min_vtime_dsq); + return; + } + + // If it was in the cell L3 DSQ, we are competing with other cpus in the cell-l3 + // try to move it to the local DSQ + if (scx_bpf_dsq_move_to_local(min_vtime_dsq)) { + // We won the race and got the task, return + return; } - if (!moved) -#endif - scx_bpf_dsq_move_to_local(local_dsq); } + +#if MITOSIS_ENABLE_STEALING + // We didn't find a task in either DSQ, or lost the race. + // Instead of going straight to idle, attempt to steal a task from another + // L3 in the cell. + + // Try stealing. If successful, this moves the task to the local runqueue + try_stealing_work(cell, l3); +#endif } struct cpumask_entry { @@ -1129,10 +1102,10 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) * effective cpumask+DSQ. Preserve vtime to keep fairness. 
*/ #if MITOSIS_ENABLE_STEALING - if (tctx->pending_l3 >= 0 && tctx->pending_l3 < MAX_L3S) { + if (l3_is_valid(tctx->pending_l3)) { u64 save_v = p->scx.dsq_vtime; tctx->l3 = tctx->pending_l3; - tctx->pending_l3 = INVALID_L3_ID; + tctx->pending_l3 = L3_INVALID; update_task_cpumask(p, tctx); p->scx.dsq_vtime = save_v; } @@ -1155,7 +1128,7 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) /* * Update per-(cell, L3) vtime for cell-schedulable tasks */ - if (tctx->all_cell_cpus_allowed && tctx->l3 >= 0 && tctx->l3 < MAX_L3S) { + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), p->scx.dsq_vtime)) WRITE_ONCE(cell->l3_vtime_now[tctx->l3], p->scx.dsq_vtime); } @@ -1202,8 +1175,7 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) * For cell-schedulable tasks, also accumulate vtime into * per-cell per-L3 queues */ - if (tctx->all_cell_cpus_allowed && tctx->l3 >= 0 && - tctx->l3 < MAX_L3S) { + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { /* Accumulate weighted execution time into per-(cell, L3) vtime */ cell->l3_vtime_now[tctx->l3] += used * DEFAULT_WEIGHT_MULTIPLIER / @@ -1355,12 +1327,7 @@ s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, } /* Initialize L3 to invalid before cell assignment */ - tctx->l3 = INVALID_L3_ID; -#if MITOSIS_ENABLE_STEALING - tctx->pending_l3 = INVALID_L3_ID; - tctx->steal_count = 0; - tctx->last_stolen_at = 0; -#endif + init_task_l3(tctx); // TODO clean this up if ((ret = update_task_cell(p, tctx, args->cgroup))) { diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index 4eb3b2231f..3f546512e8 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -56,7 +56,7 @@ enum mitosis_constants { ROOT_CELL_ID = 0, /* Invalid/unset L3 value */ - INVALID_L3_ID = -1, + // INVALID_L3_ID = -1, /* Default weight divisor for vtime calculation */ DEFAULT_WEIGHT_MULTIPLIER = 100, @@ -118,6 +118,7 @@ struct task_ctx { s32 pending_l3; u32 steal_count; /* how many times this task has been stolen */ u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ + u32 steals_prevented; /* how many times this task has been prevented from being stolen */ #endif }; @@ -125,6 +126,8 @@ struct task_ctx { static inline struct cell *lookup_cell(int idx); static inline const struct cpumask *lookup_cell_cpumask(int idx); +static inline struct task_ctx *lookup_task_ctx(struct task_struct *p); + /* MAP TYPES */ struct function_counters_map { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); @@ -132,3 +135,44 @@ struct function_counters_map { __type(value, u64); __uint(max_entries, NR_COUNTERS); }; + +// static __always_inline void task_release_cleanup(struct task_struct **pp) +// { +// if (*pp) +// bpf_task_release(*pp); +// } + +// #define SCOPED_TASK __attribute__((cleanup(task_release_cleanup))) + +// __always_inline struct task_struct * dsq_head_peek(u64 dsq_id, task_struct *p) +// { +// bpf_rcu_read_lock(); +// struct task_struct *p = NULL; +// bpf_for_each(scx_dsq, p, dsq_id, 0) { +// bpf_task_acquire(p); /* extend lifetime beyond loop */ +// break; /* only want the head */ +// } +// bpf_rcu_read_unlock(); + +// return p; +// } + +// static __always_inline struct task_struct * +// dsq_head_peek(u64 dsq_id) +// { +// struct bpf_iter_scx_dsq it = {}; +// struct task_struct *p; + +// if (bpf_iter_scx_dsq_new(&it, dsq_id, 0)) +// return NULL; + +// /* First 
element in dispatch order is the head. */ +// p = bpf_iter_scx_dsq_next(&it); + +// /* Take a ref so the pointer remains valid after we destroy the iter. */ +// if (p) +// bpf_task_acquire(p); + +// bpf_iter_scx_dsq_destroy(&it); +// return p; /* caller must bpf_task_release(p) when done */ +// } diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index c68a1476b1..9677be60c9 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ b/scheds/rust/scx_mitosis/src/main.rs @@ -560,7 +560,7 @@ impl<'a> Scheduler<'a> { let mut all_counters = Vec::new(); // Read counters for each function - for counter_idx in 0..bpf_intf::counter_idx_NR_COUNTERS { + for counter_idx in 0..bpf_intf::fn_counter_idx_NR_COUNTERS { let key = (counter_idx as u32).to_ne_bytes(); // Read per-CPU values @@ -620,7 +620,7 @@ impl<'a> Scheduler<'a> { } // Zero out all counters after printing - for counter_idx in 0..bpf_intf::counter_idx_NR_COUNTERS { + for counter_idx in 0..bpf_intf::fn_counter_idx_NR_COUNTERS { let key = (counter_idx as u32).to_ne_bytes(); let zero_value = 0u64.to_ne_bytes().to_vec(); From 7ddaba0d9646098a02d8bf178d934902f9bf35d9 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Fri, 26 Sep 2025 05:48:25 -0700 Subject: [PATCH 07/12] Use dsq_id_t type --- scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h | 13 ++--- .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 2 +- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 31 ++++++------ scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 47 ++----------------- 4 files changed, 24 insertions(+), 69 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h index a6b899d2f5..a8a8a21c2e 100644 --- a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -144,9 +144,8 @@ static inline bool is_cpu_dsq(dsq_id_t dsq_id) } // If this is a per cpu dsq, return the cpu -static inline u32 get_cpu_from_dsq(u64 id) +static inline u32 get_cpu_from_dsq(dsq_id_t dsq_id) { - dsq_id_t dsq_id = (dsq_id_t) {.raw = id}; if (!is_cpu_dsq(dsq_id)) scx_bpf_error("trying to get cpu from non-cpu dsq\n"); @@ -154,21 +153,19 @@ static inline u32 get_cpu_from_dsq(u64 id) } /* Helper functions to construct DSQ IDs */ -static inline u64 get_cpu_dsq_id(u32 cpu) +static inline dsq_id_t get_cpu_dsq_id(u32 cpu) { // Check for valid CPU range, 0 indexed so >=. 
if (cpu >= MAX_CPUS) scx_bpf_error("invalid cpu %u\n", cpu); - dsq_id_t dsq_id = { .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; - return dsq_id.raw; + return (dsq_id_t){ .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; } -static inline u64 get_cell_l3_dsq_id(u32 cell, u32 l3) +static inline dsq_id_t get_cell_l3_dsq_id(u32 cell, u32 l3) { if (cell >= MAX_CELLS || l3 >= MAX_L3S) scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); - dsq_id_t dsq_id = { .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; - return dsq_id.raw; + return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; } diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h index 80ab1cc26b..12c1a2c28c 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -229,7 +229,7 @@ static inline bool try_stealing_work(u32 cell, s32 local_l3) if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) continue; - u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3); + u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3).raw; struct task_struct *task = NULL; struct task_ctx *task_ctx; diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 363a013935..98820c122b 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -602,7 +602,7 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) if (time_before(vtime, basis_vtime - slice_ns)) vtime = basis_vtime - slice_ns; - scx_bpf_dsq_insert_vtime(p, tctx->dsq, slice_ns, vtime, enq_flags); + scx_bpf_dsq_insert_vtime(p, tctx->dsq.raw, slice_ns, vtime, enq_flags); /* Kick the CPU if needed */ if (!__COMPAT_is_enq_cpu_selected(enq_flags) && cpu >= 0) @@ -622,10 +622,10 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) cell = READ_ONCE(cctx->cell); /* Start from a valid DSQ */ - u64 local_dsq = get_cpu_dsq_id(cpu); + dsq_id_t local_dsq = get_cpu_dsq_id(cpu); bool found = false; - u64 min_vtime_dsq = local_dsq; + dsq_id_t min_vtime_dsq = local_dsq; u64 min_vtime = ~0ULL; /* U64_MAX */ struct task_struct *p; @@ -636,8 +636,8 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) /* Check the L3 queue */ if (l3 != L3_INVALID) { - u64 cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); - bpf_for_each(scx_dsq, p, cell_l3_dsq, 0) { + dsq_id_t cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); + bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) { min_vtime = p->scx.dsq_vtime; min_vtime_dsq = cell_l3_dsq; found = true; @@ -646,7 +646,7 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) } /* Check the CPU DSQ for a lower vtime */ - bpf_for_each(scx_dsq, p, local_dsq, 0) { + bpf_for_each(scx_dsq, p, local_dsq.raw, 0) { if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { min_vtime = p->scx.dsq_vtime; min_vtime_dsq = local_dsq; @@ -666,14 +666,14 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) // We found a task in the local or cell-L3 DSQ // If it was in the per cpu DSQ, there is no competation, grab it and return - if (min_vtime_dsq == local_dsq) { - scx_bpf_dsq_move_to_local(min_vtime_dsq); + if (min_vtime_dsq.raw == local_dsq.raw) { + scx_bpf_dsq_move_to_local(min_vtime_dsq.raw); return; } // If it was in the cell L3 DSQ, we are competing with other cpus in the cell-l3 // try to move it to the local DSQ 
- if (scx_bpf_dsq_move_to_local(min_vtime_dsq)) { + if (scx_bpf_dsq_move_to_local(min_vtime_dsq.raw)) { // We won the race and got the task, return return; } @@ -1112,7 +1112,7 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) #endif /* Validate task's DSQ before it starts running */ - if (tctx->dsq == DSQ_INVALID) { + if (tctx->dsq.raw == DSQ_INVALID) { if (tctx->all_cell_cpus_allowed) { scx_bpf_error( "Task %d has invalid DSQ 0 in running callback (CELL-SCHEDULABLE task, can run on any CPU in cell %d)", @@ -1400,7 +1400,7 @@ static __always_inline void dump_l3_state(){ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) { - u64 dsq_id; + dsq_id_t dsq_id; int i; struct cell *cell; struct cpu_ctx *cpu_ctx; @@ -1429,7 +1429,7 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) dsq_id = get_cpu_dsq_id(i); scx_bpf_dump("CPU[%d] cell=%d vtime=%llu nr_queued=%d\n", i, cpu_ctx->cell, READ_ONCE(cpu_ctx->vtime_now), - scx_bpf_dsq_nr_queued(dsq_id)); + scx_bpf_dsq_nr_queued(dsq_id.raw)); } dump_l3_state(); @@ -1447,7 +1447,7 @@ void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, scx_bpf_dump( "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%llu all_cell_cpus_allowed=%d\n", p->pid, p->scx.dsq_vtime, tctx->basis_vtime, tctx->cell, - tctx->dsq, tctx->all_cell_cpus_allowed); + tctx->dsq.raw, tctx->all_cell_cpus_allowed); scx_bpf_dump("Task[%d] CPUS=", p->pid); dump_cpumask(p->cpus_ptr); scx_bpf_dump("\n"); @@ -1479,7 +1479,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { if (*u8_ptr & (1 << (i % 8))) { bpf_cpumask_set_cpu(i, cpumask); - ret = scx_bpf_create_dsq(get_cpu_dsq_id(i), ANY_NUMA); + ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, ANY_NUMA); if (ret < 0) { bpf_cpumask_release(cpumask); return ret; @@ -1541,8 +1541,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) u32 l3; bpf_for(l3, 0, nr_l3) { - u64 id = get_cell_l3_dsq_id(i, l3); - ret = scx_bpf_create_dsq(id, ANY_NUMA); + ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, ANY_NUMA); if (ret < 0) scx_bpf_error( "Failed to create DSQ for cell %d, L3 %d: err %d", i, l3, ret); } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index 3f546512e8..2024d2b5a1 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -83,8 +83,8 @@ struct cell { // Number of L3s with at least one CPU in this cell u32 l3_present_cnt; - // TODO XXX remove this, only here temporarily to make the code compile - // current vtime of the cell + // TODO XXX remove this, only here temporarily to make the code compile + // current vtime of the cell u64 vtime_now; }; @@ -102,7 +102,7 @@ struct task_ctx { u32 cell; /* For the sake of scheduling, a task is exclusively owned by either a cell * or a cpu */ - u64 dsq; + dsq_id_t dsq; /* latest configuration that was applied for this task */ /* (to know if it has to be re-applied) */ u32 configuration_seq; @@ -135,44 +135,3 @@ struct function_counters_map { __type(value, u64); __uint(max_entries, NR_COUNTERS); }; - -// static __always_inline void task_release_cleanup(struct task_struct **pp) -// { -// if (*pp) -// bpf_task_release(*pp); -// } - -// #define SCOPED_TASK __attribute__((cleanup(task_release_cleanup))) - -// __always_inline struct task_struct * dsq_head_peek(u64 dsq_id, task_struct *p) -// { -// bpf_rcu_read_lock(); -// struct task_struct *p = NULL; -// bpf_for_each(scx_dsq, p, dsq_id, 0) { -// 
bpf_task_acquire(p); /* extend lifetime beyond loop */ -// break; /* only want the head */ -// } -// bpf_rcu_read_unlock(); - -// return p; -// } - -// static __always_inline struct task_struct * -// dsq_head_peek(u64 dsq_id) -// { -// struct bpf_iter_scx_dsq it = {}; -// struct task_struct *p; - -// if (bpf_iter_scx_dsq_new(&it, dsq_id, 0)) -// return NULL; - -// /* First element in dispatch order is the head. */ -// p = bpf_iter_scx_dsq_next(&it); - -// /* Take a ref so the pointer remains valid after we destroy the iter. */ -// if (p) -// bpf_task_acquire(p); - -// bpf_iter_scx_dsq_destroy(&it); -// return p; /* caller must bpf_task_release(p) when done */ -// } From 7639d21e720bf770d1903dc2f54aa946305b546b Mon Sep 17 00:00:00 2001 From: tommy-u Date: Wed, 8 Oct 2025 17:53:59 -0700 Subject: [PATCH 08/12] First cut at locking --- scheds/rust/scx_mitosis/build.rs | 2 +- scheds/rust/scx_mitosis/src/bpf/intf.h | 31 +++++ scheds/rust/scx_mitosis/src/bpf/intf_rust.h | 4 + .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 109 +++++++++++------- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 47 +++++--- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 99 ++++++++++++---- scheds/rust/scx_mitosis/src/main.rs | 80 +++++++++---- 7 files changed, 272 insertions(+), 100 deletions(-) create mode 100644 scheds/rust/scx_mitosis/src/bpf/intf_rust.h diff --git a/scheds/rust/scx_mitosis/build.rs b/scheds/rust/scx_mitosis/build.rs index f617cea07d..a5854f718c 100644 --- a/scheds/rust/scx_mitosis/build.rs +++ b/scheds/rust/scx_mitosis/build.rs @@ -6,7 +6,7 @@ fn main() { scx_cargo::BpfBuilder::new() .unwrap() - .enable_intf("src/bpf/intf.h", "bpf_intf.rs") + .enable_intf("src/bpf/intf_rust.h", "bpf_intf.rs") .enable_skel("src/bpf/mitosis.bpf.c", "bpf") .build() .unwrap(); diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 8957d7165c..00045df399 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -11,6 +11,7 @@ typedef unsigned int u32; typedef _Bool bool; #endif + #ifdef LSP #define __bpf__ #include "../../../../include/scx/ravg.bpf.h" @@ -34,6 +35,36 @@ enum consts { PCPU_BASE = 0x80000000, MAX_CG_DEPTH = 256, + + MAX_L3S = 16, +}; + +/* Kernel side sees the real lock; userspace sees padded bytes of same size/alignment */ +#if defined(__BPF__) +# define CELL_LOCK_T struct bpf_spin_lock +#else +/* userspace placeholder: kernel won’t copy spin_lock */ +# define CELL_LOCK_T struct { u32 __pad; } /* 4-byte aligned as required */ +#endif + +struct cell { + // This is a lock in the kernel and padding in the user + CELL_LOCK_T lock; + + // Whether or not the cell is used + u32 in_use; + // Number of CPUs in this cell + u32 cpu_cnt; + // per-L3 vtimes within this cell + u64 l3_vtime_now[MAX_L3S]; + // Number of CPUs from each L3 assigned to this cell + u32 l3_cpu_cnt[MAX_L3S]; + // Number of L3s with at least one CPU in this cell + u32 l3_present_cnt; + + // TODO XXX remove this, only here temporarily to make the code compile + // current vtime of the cell + u64 vtime_now; }; /* Statistics */ diff --git a/scheds/rust/scx_mitosis/src/bpf/intf_rust.h b/scheds/rust/scx_mitosis/src/bpf/intf_rust.h new file mode 100644 index 0000000000..f8ffd3252a --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/intf_rust.h @@ -0,0 +1,4 @@ +/* Force userspace path for Rust bindgen */ +#undef __BPF__ +#undef __bpf__ +#include "intf.h" diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h 
index 12c1a2c28c..7ed77d68c3 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -86,46 +86,67 @@ static inline const struct cpumask *lookup_l3_cpumask(u32 l3) static __always_inline void recalc_cell_l3_counts(u32 cell_idx) { struct cell *cell = lookup_cell(cell_idx); - if (!cell) + if (!cell) { + scx_bpf_error("recalc_cell_l3_counts: invalid cell %d", + cell_idx); return; + } - struct bpf_cpumask *tmp = bpf_cpumask_create(); - if (!tmp) + CPUMASK_GUARD(tmp_guard); + if (!tmp_guard.mask) { + scx_bpf_error( + "recalc_cell_l3_counts: failed to create tmp mask"); return; + } - u32 l3, present = 0, total_cpus = 0; + u32 l3, l3s_present = 0, total_cpus = 0; + // Just so we don't hold the lock longer than necessary + u32 l3_cpu_cnt_tmp[MAX_L3S] = {0}; - bpf_rcu_read_lock(); - const struct cpumask *cell_mask = - lookup_cell_cpumask(cell_idx); // RCU ptr - if (!cell_mask) { - bpf_rcu_read_unlock(); - bpf_cpumask_release(tmp); - return; - } + { // RCU context + RCU_READ_GUARD(); + const struct cpumask *cell_mask = + lookup_cell_cpumask(cell_idx); // RCU ptr - bpf_for(l3, 0, nr_l3) - { - const struct cpumask *l3_mask = - lookup_l3_cpumask(l3); // plain map memory - if (!l3_mask) { - cell->l3_cpu_cnt[l3] = 0; - continue; + if (!cell_mask) { + scx_bpf_error("recalc_cell_l3_counts: invalid cell mask"); + return; + } + + bpf_for(l3, 0, nr_l3) + { + const struct cpumask *l3_mask = lookup_l3_cpumask(l3); + if (!l3_mask) { + scx_bpf_error( "recalc_cell_l3_counts: invalid l3 mask"); + return; + } + + bpf_cpumask_and(tmp_guard.mask, cell_mask, l3_mask); + + u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp_guard.mask); + + l3_cpu_cnt_tmp[l3] = cnt; + + bpf_printk("recalc_cell_l3_counts: cnt %d", cnt); + + // These are counted across the whole cell + total_cpus += cnt; + + // Number of non-empty L3s in this cell + if (cnt) + l3s_present++; } + } // unlock RCU - bpf_cpumask_and(tmp, cell_mask, l3_mask); - u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp); - cell->l3_cpu_cnt[l3] = cnt; - total_cpus += cnt; - if (cnt) - present++; + bpf_spin_lock(&cell->lock); + for (u32 l3 = 0; l3 < nr_l3; l3++) { + cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; } - bpf_rcu_read_unlock(); - cell->l3_present_cnt = present; + cell->l3_present_cnt = l3s_present; cell->cpu_cnt = total_cpus; - bpf_cpumask_release(tmp); + bpf_spin_unlock(&cell->lock); } /** @@ -138,29 +159,32 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) * @cell_id: The cell ID to select an L3 from * @return: L3 ID on success, L3_INVALID on error */ +// TODO: Lock static inline s32 pick_l3_for_task(u32 cell_id) { struct cell *cell; - u32 l3, target, cur = 0; - s32 ret = L3_INVALID; /* Look up the cell structure */ - if (!(cell = lookup_cell(cell_id))) + if (!(cell = lookup_cell(cell_id))) { + scx_bpf_error("pick_l3_for_task: invalid cell %d", cell_id); return L3_INVALID; + } - /* Handle case where cell has no CPUs assigned yet */ + // No cpus if (!cell->cpu_cnt) { - scx_bpf_error( - "pick_l3_for_task: cell %d has no CPUs accounted yet", - cell_id); + scx_bpf_error( "pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); return L3_INVALID; } - /* Generate random target value in range [0, cpu_cnt) */ - target = bpf_get_prandom_u32() % cell->cpu_cnt; - /* Find the L3 domain corresponding to the target value using * weighted selection - accumulate CPU counts until we exceed target */ + + /* Generate random target value in range [0, cpu_cnt) */ + u32 target = 
bpf_get_prandom_u32() % cell->cpu_cnt; + u32 l3, cur = 0; + s32 ret = L3_INVALID; + + // This could be a prefix sum. Find first l3 where we exceed target bpf_for(l3, 0, nr_l3) { cur += cell->l3_cpu_cnt[l3]; @@ -169,6 +193,12 @@ static inline s32 pick_l3_for_task(u32 cell_id) break; } } + + if (ret == L3_INVALID) { + scx_bpf_error("pick_l3_for_task: invalid L3"); + return L3_INVALID; + } + return ret; } @@ -226,6 +256,7 @@ static inline bool try_stealing_work(u32 cell, s32 local_l3) // Skip L3s that are not present in this cell // Note: rechecking cell_ptr for verifier + // TODO: Lock? if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) continue; @@ -244,7 +275,7 @@ static inline bool try_stealing_work(u32 cell, s32 local_l3) // Just a trick for peeking the head element bpf_for_each(scx_dsq, task, candidate_dsq, 0) { - task_ctx = lookup_task_ctx(task); + task_ctx = lookup_task_ctx(task); found_task = (task_ctx != NULL); break; } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 98820c122b..b920ecaf25 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -13,7 +13,7 @@ */ // TODO: fix debug printer. -#include "intf.h" +// #include "intf.h" #include "mitosis.bpf.h" #include "dsq.bpf.h" @@ -45,9 +45,15 @@ private(root_cgrp) struct cgroup __kptr *root_cgrp; UEI_DEFINE(uei); +// Cells now defined as a map so we can lock. +struct cell_map cells SEC(".maps"); + /* * Maps used for L3-aware scheduling */ +#if 0 +struct cell_locks_map cell_locks SEC(".maps"); +#endif struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); struct l3_to_cpus_map l3_to_cpus SEC(".maps"); @@ -162,19 +168,6 @@ static inline struct cpu_ctx *lookup_cpu_ctx(int cpu) return cctx; } -struct cell cells[MAX_CELLS]; - -static inline struct cell *lookup_cell(int idx) -{ - struct cell *cell; - - cell = MEMBER_VPTR(cells, [idx]); - if (!cell) { - scx_bpf_error("Invalid cell %d", idx); - return NULL; - } - return cell; -} /* * Cells are allocated concurrently in some cases (e.g. cgroup_init). @@ -191,8 +184,11 @@ static inline int allocate_cell() if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) { // TODO XXX, I think we need to make this concurrent safe + // TODO, lock with recalc_cell...() __builtin_memset(c->l3_cpu_cnt, 0, sizeof(c->l3_cpu_cnt)); c->l3_present_cnt = 0; + // TODO zero cpu_cnt + // TODO Just zero the whole cell struct? 
return cell_idx; } } @@ -325,8 +321,10 @@ static inline int update_task_cpumask(struct task_struct *p, /* --- Pick a new L3 if needed --- */ if (tctx->l3 == L3_INVALID) { s32 new_l3 = pick_l3_for_task(tctx->cell); - if (new_l3 < 0) + if (new_l3 < 0) { + scx_bpf_error("bad L3: %d", new_l3); return -ENODEV; + } tctx->l3 = new_l3; l3_mask = lookup_l3_cpumask((u32)tctx->l3); if (!l3_mask) @@ -339,8 +337,10 @@ static inline int update_task_cpumask(struct task_struct *p, bpf_cpumask_and(tctx->cpumask, (const struct cpumask *)tctx->cpumask, l3_mask); /* If empty after intersection, nothing can run here */ - if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) + if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { + scx_bpf_error("Empty cpumask after intersection"); return -ENODEV; + } /* --- Point to the correct (cell,L3) DSQ and set vtime baseline --- */ tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); @@ -349,8 +349,10 @@ static inline int update_task_cpumask(struct task_struct *p, if (!cell) return -ENOENT; - if (!l3_is_valid(tctx->l3)) + if (!l3_is_valid(tctx->l3)){ + scx_bpf_error("Invalid L3 %d", tctx->l3); return -EINVAL; + } p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); } else { @@ -1386,7 +1388,8 @@ static __always_inline void dump_cell_state(u32 cell_idx) cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); u32 l3; - // Print vtimes for L3s + // TODO Print vtimes for L3s + // TODO lock bpf_for(l3, 0, nr_l3) { if (cell->l3_cpu_cnt[l3] > 0) { scx_bpf_dump(" L3[%d]: %d CPUs", l3, cell->l3_cpu_cnt[l3]); @@ -1490,6 +1493,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) } } + cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); if (cpumask) bpf_cpumask_release(cpumask); @@ -1529,7 +1533,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) } } - cells[0].in_use = true; + struct cell *cell = lookup_cell(0); + if (!cell) { + scx_bpf_error("Failed to lookup cell 0"); + return -ENOENT; + } + cell->in_use = true; /* Configure root cell (cell 0) topology at init time using nr_l3 and l3_to_cpu masks */ recalc_cell_l3_counts(ROOT_CELL_ID); diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index 2024d2b5a1..4441a19a27 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -20,12 +20,8 @@ #endif #include "intf.h" - -#define MAX_L3S 16 - #include "dsq.bpf.h" - /* * A couple of tricky things about checking a cgroup's cpumask: * @@ -50,6 +46,10 @@ extern const volatile u32 nr_l3; + + +extern struct cell_map cells; + enum mitosis_constants { /* Root cell index */ @@ -71,22 +71,36 @@ enum mitosis_constants { ANY_NUMA = -1, }; -struct cell { - // Whether or not the cell is used or not - u32 in_use; - // Number of CPUs in this cell - u32 cpu_cnt; - // per-L3 vtimes within this cell - u64 l3_vtime_now[MAX_L3S]; - // Number of CPUs from each L3 assigned to this cell - u32 l3_cpu_cnt[MAX_L3S]; - // Number of L3s with at least one CPU in this cell - u32 l3_present_cnt; - - // TODO XXX remove this, only here temporarily to make the code compile - // current vtime of the cell - u64 vtime_now; -}; + + + +static inline struct cell *lookup_cell(int idx) +{ + struct cell *cell; + + cell = bpf_map_lookup_elem(&cells, &idx); + + if (!cell) { + scx_bpf_error("Invalid cell %d", idx); + return NULL; + } + return cell; +} + +static inline struct bpf_spin_lock *get_cell_lock(u32 cell_idx) +{ + if (cell_idx >= MAX_CELLS) { + 
scx_bpf_error("Invalid cell index %d", cell_idx); + return NULL; + } + + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_error("Cell %d not found", cell_idx); + return NULL; + } + return &cell->lock; +} /* * task_ctx is the per-task information kept by scx_mitosis @@ -123,7 +137,6 @@ struct task_ctx { }; // These could go in mitosis.bpf.h, but we'll cross that bridge when we get -static inline struct cell *lookup_cell(int idx); static inline const struct cpumask *lookup_cell_cpumask(int idx); static inline struct task_ctx *lookup_task_ctx(struct task_struct *p); @@ -135,3 +148,47 @@ struct function_counters_map { __type(value, u64); __uint(max_entries, NR_COUNTERS); }; + +struct cell_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cell); + __uint(max_entries, MAX_CELLS); +}; + +struct rcu_read_guard { + bool active; +}; + +static inline struct rcu_read_guard rcu_read_lock_guard(void) { + bpf_rcu_read_lock(); + return (struct rcu_read_guard){.active = true}; +} + +static inline void rcu_read_guard_release(struct rcu_read_guard *guard) { + if (guard->active) { + bpf_rcu_read_unlock(); + guard->active = false; + } +} +#define RCU_READ_GUARD() \ + struct rcu_read_guard __rcu_guard __attribute__((__cleanup__(rcu_read_guard_release))) = rcu_read_lock_guard() + +struct cpumask_guard { + struct bpf_cpumask *mask; +}; + +static inline struct cpumask_guard cpumask_create_guard(void) { + struct bpf_cpumask *mask = bpf_cpumask_create(); + return (struct cpumask_guard){.mask = mask}; +} + +static inline void cpumask_guard_release(struct cpumask_guard *guard) { + if (guard->mask) { + bpf_cpumask_release(guard->mask); + guard->mask = NULL; + } +} + +#define CPUMASK_GUARD(var_name) \ + struct cpumask_guard var_name __attribute__((__cleanup__(cpumask_guard_release))) = cpumask_create_guard() diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index 9677be60c9..b25be74326 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ b/scheds/rust/scx_mitosis/src/main.rs @@ -24,7 +24,7 @@ use anyhow::Context; use anyhow::Result; use clap::Parser; use crossbeam::channel::RecvTimeoutError; -use libbpf_rs::{MapCore, OpenObject}; +use libbpf_rs::{MapCore, OpenObject, MapFlags}; use log::debug; use log::info; use log::trace; @@ -49,6 +49,12 @@ use stats::CellMetrics; use stats::Metrics; use crate::mitosis_topology_utils::{populate_topology_maps, MapKind}; +// This is the cell type from intf.h. +// When copied to user, the lock field is omitted. +// We can mmap it, or use calls to the BPF_MAP_LOOKUP_ELEM +// command of the bpf() system call with the BPF_F_LOCK flag +type BpfCell = bpf_intf::cell; + const SCHEDULER_NAME: &str = "scx_mitosis"; const MAX_CELLS: usize = bpf_intf::consts_MAX_CELLS as usize; const NR_CSTATS: usize = bpf_intf::cell_stat_idx_NR_CSTATS as usize; @@ -138,14 +144,14 @@ const QUEUE_STATS_IDX: [bpf_intf::cell_stat_idx; 3] = [ // Per cell book-keeping #[derive(Debug)] -struct Cell { +struct CellMask { cpus: Cpumask, } struct Scheduler<'a> { skel: BpfSkel<'a>, monitor_interval: Duration, - cells: HashMap, + cells: HashMap, // These are the per-cell cstats. // Note these are accumulated across all CPUs. 
prev_cell_stats: [[u64; NR_CSTATS]; MAX_CELLS], @@ -193,11 +199,36 @@ impl Display for DistributionStats { } impl<'a> Scheduler<'a> { + fn get_bpf_cell(&self, cell_id: u32) -> anyhow::Result> { + let key = cell_id.to_ne_bytes(); + let map = &self.skel.maps.cells; // NOTE: map is a field, not a method + + match map.lookup(&key, MapFlags::ANY)? { + Some(bytes) => { + let need = core::mem::size_of::(); + if bytes.len() != need { + anyhow::bail!("cells value size {} != BpfCell {}", bytes.len(), need); + } + // Copy to an aligned buffer to avoid misaligned reference + let mut tmp = MaybeUninit::::uninit(); + unsafe { + std::ptr::copy_nonoverlapping( + bytes.as_ptr(), + tmp.as_mut_ptr() as *mut u8, + need, + ); + Ok(Some(tmp.assume_init())) + } + } + None => Ok(None), + } + } + fn is_cell_in_use(&self, cell_id: u32) -> bool { - let cells = &self.skel.maps.bss_data.as_ref().unwrap().cells; - let bpf_cell = cells[cell_id as usize]; - let in_use = unsafe { std::ptr::read_volatile(&bpf_cell.in_use as *const u32) }; - in_use != 0 + match self.get_bpf_cell(cell_id) { + Ok(Some(c)) => c.in_use != 0, + _ => false, + } } fn init(opts: &Opts, open_object: &'a mut MaybeUninit) -> Result { @@ -235,6 +266,18 @@ impl<'a> Scheduler<'a> { let mut skel = scx_ops_load!(skel, mitosis, uei)?; + // Verify our version of the cell datastructure is the same size + // as the bpf one. + let cells_info = skel.maps.cells.info()?; + let usz = core::mem::size_of::() as u32; + if cells_info.info.value_size != usz { + bail!( + "cells value_size={} but Rust expects {} (BpfCell)", + cells_info.info.value_size, + usz + ); + } + // Set up CPU to L3 topology mapping using the common functionality populate_topology_maps(&mut skel, MapKind::CpuToL3, None)?; @@ -474,7 +517,7 @@ impl<'a> Scheduler<'a> { fn print_debug_status(&self) { if let Ok(flags) = DEBUG_FLAGS.lock() { let mut disabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| (!enabled).then_some(format!("{}~{}{}", ANSI_RED, flag, ANSI_RESET))).collect(); - let mut enabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| enabled.then_some(format!("{}+{}{}", ANSI_GREEN, flag, ANSI_RESET))).collect(); + let enabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| enabled.then_some(format!("{}+{}{}", ANSI_GREEN, flag, ANSI_RESET))).collect(); disabled.extend(enabled); trace!("Debug Flags: {}", if disabled.is_empty() { "none".to_string() } else { disabled.join(" ") }); // trace!("hint: sudo ./scx_mitosis cli debug ~/+"); @@ -567,7 +610,7 @@ impl<'a> Scheduler<'a> { let percpu_values = self.skel .maps .function_counters - .lookup_percpu(&key, libbpf_rs::MapFlags::ANY) + .lookup_percpu(&key, MapFlags::ANY) .context("Failed to lookup function counter")? .unwrap_or_default(); @@ -632,7 +675,7 @@ impl<'a> Scheduler<'a> { self.skel .maps .function_counters - .update_percpu(&key, &percpu_values, libbpf_rs::MapFlags::ANY) + .update_percpu(&key, &percpu_values, MapFlags::ANY) .context("Failed to reset function counter")?; } @@ -655,7 +698,7 @@ fn update_steal_metrics(&mut self) -> Result<()> { let key = 0u32.to_ne_bytes(); // Read the count; lazily initialize the slot to 0 if it doesn't exist. 
- let steal_count = match self.skel.maps.steal_stats.lookup(&key, libbpf_rs::MapFlags::ANY) { + let steal_count = match self.skel.maps.steal_stats.lookup(&key, MapFlags::ANY) { Ok(Some(data)) if data.len() >= 8 => { u64::from_ne_bytes(data[..8].try_into().unwrap()) } @@ -667,7 +710,7 @@ fn update_steal_metrics(&mut self) -> Result<()> { } Ok(None) => { let zero = 0u64.to_ne_bytes(); - if let Err(e) = self.skel.maps.steal_stats.update(&key, &zero, libbpf_rs::MapFlags::ANY) { + if let Err(e) = self.skel.maps.steal_stats.update(&key, &zero, MapFlags::ANY) { if steals_debug { debug!("Failed to initialize steal_stats map: {e}"); } @@ -736,15 +779,12 @@ fn update_steal_metrics(&mut self) -> Result<()> { // Create cells we don't have yet, drop cells that are no longer in use. // If we continue to drop cell metrics once a cell is removed, we'll need to make sure we // flush metrics for a cell before we remove it completely. - let cells = &self.skel.maps.bss_data.as_ref().unwrap().cells; for i in 0..MAX_CELLS { let cell_idx = i as u32; - let bpf_cell = cells[i]; - let in_use = unsafe { std::ptr::read_volatile(&bpf_cell.in_use as *const u32) }; - if in_use > 0 { + if self.is_cell_in_use(cell_idx) { self.cells .entry(cell_idx) - .or_insert_with(|| Cell { + .or_insert_with(|| CellMask { cpus: Cpumask::new(), }) .cpus = cell_to_cpus @@ -769,7 +809,7 @@ fn read_cpu_ctxs(skel: &BpfSkel) -> Result> { let cpu_ctxs_vec = skel .maps .cpu_ctxs - .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY) + .lookup_percpu(&0u32.to_ne_bytes(), MapFlags::ANY) .context("Failed to lookup cpu_ctx")? .unwrap(); for cpu in 0..*NR_CPUS_POSSIBLE { @@ -787,7 +827,7 @@ fn read_cpu_to_l3(skel: &BpfSkel) -> Result> { let val = skel .maps .cpu_to_l3 - .lookup(&key, libbpf_rs::MapFlags::ANY)? + .lookup(&key, MapFlags::ANY)? .map(|v| u32::from_ne_bytes(v.try_into().unwrap())) .unwrap_or(0); cpu_to_l3.push(val); @@ -806,7 +846,7 @@ fn read_l3_to_cpus(skel: &BpfSkel) -> Result> { let mask = if let Some(v) = skel .maps .l3_to_cpus - .lookup(&key, libbpf_rs::MapFlags::ANY)? + .lookup(&key, MapFlags::ANY)? { let bytes = v.as_slice(); let mut longs = [0u64; CPUMASK_LONG_ENTRIES]; From 7972846b8459eb6455d652889459101fc5ea73b3 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Thu, 9 Oct 2025 14:10:13 -0700 Subject: [PATCH 09/12] Lock cell state --- code.txt | 2382 +++++++++++++++++ scheds/rust/scx_mitosis/src/bpf/intf.h | 47 +- .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 17 +- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 25 +- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 14 +- 5 files changed, 2454 insertions(+), 31 deletions(-) create mode 100644 code.txt diff --git a/code.txt b/code.txt new file mode 100644 index 0000000000..64c3002bbe --- /dev/null +++ b/code.txt @@ -0,0 +1,2382 @@ +]633;E;for file in scheds/rust/scx_mitosis/src/bpf/*;7dc75c10-53e2-4af4-8cab-ea0159bd7502]633;C# File: scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header defines the 64-bit dispatch queue (DSQ) ID encoding + * scheme for scx_mitosis, using type fields to distinguish between + * per-CPU and cell+L3 domain queues. It includes helper functions to + * construct, validate, and parse these DSQ IDs for queue management. 
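+ * For example, get_cell_l3_dsq_id(3, 2).raw encodes to 0x20030002:
+ * QTYPE 2 in bits 31..28, cell 3 in bits 27..16, L3 2 in bits 15..0.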
+ */ +#pragma once + +#include "intf.h" +#include "mitosis.bpf.h" + +/* + * ================================ + * BPF DSQ ID Layout (64 bits wide) + * ================================ + * + * Top-level format: + * [63] [62..0] + * [ B] [ ID ] + * + * If B == 1 it is a Built-in DSQ + * ------------------------- + * [63] [62] [61 .. 32] [31..0] + * [ 1] [ L] [ R ] [ V ] + * + * - L (bit 62): LOCAL_ON flag + * If L == 1 -> V = CPU number + * - R (30 bits): reserved / unused + * - V (32 bits): value (e.g., CPU#) + * + * If B == 0 -> User-defined DSQ + * ----------------------------- + * Only the low 32 bits are used. + * + * [63 .. 32] [31..0] + * [ 0][ unused ] [ VAL ] + * + * Mitosis uses VAL as follows: + * + * [31..28] [27..0] + * [QTYPE ] [DATA ] + * + * QTYPE encodes the queue type: + * + * QTYPE = 0x1 -> Per-CPU Q + * [31..28] [27 .. .. 0] + * [ 0001 ] [ CPU# ] + * [Q-TYPE:1] + * + * QTYPE = 0x2 -> Cell+L3 Q + * [31..28] [27 .. 16] [15 .. 0] + * [ 0010 ] [ CELL# ] [ L3ID ] + * [Q-TYPE:2] + * + */ +/* + * The use of these bitfields depends on compiler defined byte AND bit ordering. + * Make sure we're only building with Clang/LLVM and that we're little-endian. + */ +#ifndef __clang__ +#error "This code must be compiled with Clang/LLVM (eBPF: clang -target bpf)." +#endif + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +#error "dsq64 bitfield layout assumes little-endian (bpfel)." +#endif + +/* ---- Bitfield widths (bits) ---- */ +#define CPU_B 28 +#define L3_B 16 +#define CELL_B 12 +#define TYPE_B 4 +#define DATA_B 28 +#define RSVD_B 32 + +/* Sum checks (in bits) */ +_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); +_Static_assert(L3_B + CELL_B + TYPE_B == 32, "CELL+L3 layout low half must be 32 bits"); +_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); + +typedef union { + u64 raw; + + /* Per-CPU user DSQ */ + struct { u64 cpu: CPU_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cpu_dsq; + + /* Cell+L3 user DSQ */ + struct { u64 l3: L3_B; u64 cell: CELL_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cell_l3_dsq; + + /* Generic user view */ + struct { u64 data: DATA_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } user_dsq; + + /* Built-in DSQ view */ + struct { u64 value:32; u64 rsvd:30; u64 local_on:1; u64 builtin:1; } builtin_dsq; + + /* NOTE: Considered packed and aligned attributes, but that's redundant */ +} dsq_id_t; + +/* + * Invalid DSQ ID Sentinel: + * invalid bc bit 63 clear (it's a user DSQ) && dsq_type == 0 (no type) + * Good for catching uninitialized DSQ IDs. 
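+ * A zero-initialized dsq_id_t therefore reads as DSQ_INVALID, which is how
+ * mitosis_running() catches tasks whose DSQ was never assigned.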
+*/ +#define DSQ_INVALID ((u64) 0) + +_Static_assert(sizeof(((dsq_id_t){0}).cpu_dsq) == sizeof(u64), "cpu view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).cell_l3_dsq) == sizeof(u64), "cell+l3 view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).user_dsq) == sizeof(u64), "user common view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).builtin_dsq) == sizeof(u64), "builtin view must be 8 bytes"); + +/* Compile-time checks (in bytes) */ +_Static_assert(sizeof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8 bytes (64 bits)"); +_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8-byte aligned"); + +/* DSQ type enumeration */ +enum dsq_type { + DSQ_TYPE_NONE, + DSQ_TYPE_CPU, + DSQ_TYPE_CELL_L3, +}; + +/* Range guards */ +_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); +_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); +_Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); +_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), "DSQ_TYPE_CELL_L3 must fit in field"); + +/* + * While I considered error propagation, I decided to bail to force errors early. +*/ + +static inline bool is_user_dsq(dsq_id_t dsq_id){ + return !dsq_id.builtin_dsq.builtin && dsq_id.user_dsq.type != DSQ_TYPE_NONE; +} + +// Is this a per CPU DSQ? +static inline bool is_cpu_dsq(dsq_id_t dsq_id) +{ + return is_user_dsq(dsq_id) && dsq_id.user_dsq.type == DSQ_TYPE_CPU; +} + +// If this is a per cpu dsq, return the cpu +static inline u32 get_cpu_from_dsq(dsq_id_t dsq_id) +{ + if (!is_cpu_dsq(dsq_id)) + scx_bpf_error("trying to get cpu from non-cpu dsq\n"); + + return dsq_id.cpu_dsq.cpu; +} + +/* Helper functions to construct DSQ IDs */ +static inline dsq_id_t get_cpu_dsq_id(u32 cpu) +{ + // Check for valid CPU range, 0 indexed so >=. + if (cpu >= MAX_CPUS) + scx_bpf_error("invalid cpu %u\n", cpu); + + return (dsq_id_t){ .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; +} + +static inline dsq_id_t get_cell_l3_dsq_id(u32 cell, u32 l3) +{ + if (cell >= MAX_CELLS || l3 >= MAX_L3S) + scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); + + return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; +} +# File: scheds/rust/scx_mitosis/src/bpf/intf.h +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. 
+#ifndef __INTF_H +#define __INTF_H + +#ifndef __KERNEL__ +typedef unsigned long long u64; +typedef unsigned int u32; +typedef _Bool bool; +#endif + +#ifdef LSP +#define __bpf__ +#include "../../../../include/scx/ravg.bpf.h" +#else +#include +#endif + +/* ---- Work stealing config (compile-time) ------------------------------- */ +#ifndef MITOSIS_ENABLE_STEALING +#define MITOSIS_ENABLE_STEALING 1 +#endif +/* ----------------------------------------------------------------------- */ + +enum consts { + CACHELINE_SIZE = 64, + MAX_CPUS_SHIFT = 9, + MAX_CPUS = 1 << MAX_CPUS_SHIFT, + MAX_CPUS_U8 = MAX_CPUS / 8, + MAX_CELLS = 16, + USAGE_HALF_LIFE = 100000000, /* 100ms */ + + PCPU_BASE = 0x80000000, + MAX_CG_DEPTH = 256, +}; + +/* Statistics */ +enum cell_stat_idx { + CSTAT_LOCAL, + CSTAT_CPU_DSQ, + CSTAT_CELL_DSQ, + CSTAT_AFFN_VIOL, + NR_CSTATS, +}; + +/* Function invocation counters */ +enum fn_counter_idx { + COUNTER_SELECT_CPU, + COUNTER_ENQUEUE, + COUNTER_DISPATCH, + NR_COUNTERS, +}; + +struct cpu_ctx { + u64 cstats[MAX_CELLS][NR_CSTATS]; + u64 cell_cycles[MAX_CELLS]; + u32 cell; + u64 vtime_now; +}; + +struct cgrp_ctx { + u32 cell; + bool cell_owner; +}; + +#endif /* __INTF_H */ +# File: scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header assists adding L3 cache awareness to scx_mitosis by defining + * maps and fns for managing CPU-to-L3 domain mappings. It provides code to + * recalculate per-L3 CPU counts within cells and implements weighted + * random L3 selection for tasks. It also tracks work-stealing + * statistics for cross-L3 task migrations. + */ +#pragma once + +#include "mitosis.bpf.h" +#include "intf.h" + +typedef u32 l3_id_t; +#define L3_INVALID ((l3_id_t)~0u) + +// Configure how aggressively we steal work. 
+// When task is detected as a steal candidate, skip it this many times +// On a web server workload, 100 reduced steal count by ~90% +#ifdef MITOSIS_ENABLE_STEALING +#define PREVENT_N_STEALS 0 +#endif + +/* Work stealing statistics map - accessible from both BPF and userspace */ +struct steal_stats_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 1); +}; + +// A CPU -> L3 cache ID map +struct cpu_to_l3_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, MAX_CPUS); +}; + +struct l3_to_cpus_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cpumask); + __uint(max_entries, MAX_L3S); +}; + +extern struct cpu_to_l3_map cpu_to_l3; +extern struct l3_to_cpus_map l3_to_cpus; +extern struct steal_stats_map steal_stats; + +static inline const bool l3_is_valid(u32 l3_id) +{ + if (l3_id == L3_INVALID) + return false; + + return (l3_id >= 0) && (l3_id < MAX_L3S); +} + +static inline void init_task_l3(struct task_ctx *tctx) +{ + tctx->l3 = L3_INVALID; + +#if MITOSIS_ENABLE_STEALING + tctx->pending_l3 = L3_INVALID; + tctx->steal_count = 0; + tctx->last_stolen_at = 0; + tctx->steals_prevented = 0; +#endif +} + +static inline const struct cpumask *lookup_l3_cpumask(u32 l3) +{ + struct cpumask *mask; + + if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { + scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); + return NULL; + } + + return mask; +} + +/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes */ +// TODO: use RAII and lock around updates (races with ) +static __always_inline void recalc_cell_l3_counts(u32 cell_idx) +{ + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_error("recalc_cell_l3_counts: invalid cell %d", + cell_idx); + return; + } + + CPUMASK_GUARD(tmp_guard); + if (!tmp_guard.mask) { + scx_bpf_error( + "recalc_cell_l3_counts: failed to create tmp mask"); + return; + } + + u32 l3, l3s_present = 0, total_cpus = 0; + // Just so we don't hold the lock longer than necessary + u32 l3_cpu_cnt_tmp[MAX_L3S] = {0}; + + { // RCU context + RCU_READ_GUARD(); + const struct cpumask *cell_mask = + lookup_cell_cpumask(cell_idx); // RCU ptr + + if (!cell_mask) { + scx_bpf_error( + "recalc_cell_l3_counts: invalid cell mask"); + return; + } + + bpf_for(l3, 0, nr_l3) + { + const struct cpumask *l3_mask = lookup_l3_cpumask(l3); + if (!l3_mask) { + scx_bpf_error( + "recalc_cell_l3_counts: invalid l3 mask"); + return; + } + + bpf_cpumask_and(tmp_guard.mask, cell_mask, l3_mask); + + u32 cnt = bpf_cpumask_weight( (const struct cpumask *)tmp_guard.mask); + + l3_cpu_cnt_tmp[l3] = cnt; + + bpf_printk("recalc_cell_l3_counts: cnt %d", cnt); + + // These are counted across the whole cell + total_cpus += cnt; + + if (cnt) + l3s_present++; + } + } // bpf_rcu_read_unlock(); + + // WITH_CELL_LOCK(cell, cell_idx, { + for (u32 l3 = 0; l3 < nr_l3; l3++) { + cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; + } + + cell->l3_present_cnt = l3s_present; + cell->cpu_cnt = total_cpus; + // }); +} + +/** + * Weighted random selection of an L3 cache domain for a task. + * + * Uses the CPU count in each L3 domain within the cell as weights to + * probabilistically select an L3. L3 domains with more CPUs in the cell + * have higher probability of being selected. 
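+ * For example, with l3_cpu_cnt = {4, 2, 2} (cpu_cnt = 8) the target is drawn
+ * from [0, 8): targets 0-3 select L3 0, 4-5 select L3 1, 6-7 select L3 2.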
+ * + * @cell_id: The cell ID to select an L3 from + * @return: L3 ID on success, L3_INVALID on error + */ +// TODO: Lock +static inline s32 pick_l3_for_task(u32 cell_id) +{ + struct cell *cell; + + /* Look up the cell structure */ + if (!(cell = lookup_cell(cell_id))) { + scx_bpf_error("pick_l3_for_task: invalid cell %d", cell_id); + return L3_INVALID; + } + + // No cells + if (!cell->cpu_cnt) { + scx_bpf_error( "pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); + return L3_INVALID; + } + + /* Find the L3 domain corresponding to the target value using + * weighted selection - accumulate CPU counts until we exceed target */ + + /* Generate random target value in range [0, cpu_cnt) */ + u32 target = bpf_get_prandom_u32() % cell->cpu_cnt; + u32 l3, cur = 0; + s32 ret = L3_INVALID; + + // This could be a prefix sum. Find first l3 where we exceed target + bpf_for(l3, 0, nr_l3) + { + cur += cell->l3_cpu_cnt[l3]; + if (target < cur) { + ret = (s32)l3; + break; + } + } + + if (ret == L3_INVALID) { + scx_bpf_error("pick_l3_for_task: invalid L3"); + return L3_INVALID; + } + + return ret; +} + +#ifdef MITOSIS_ENABLE_STEALING + +static inline bool try_stealing_this_task(struct task_ctx *task_ctx, + s32 local_l3, u64 candidate_dsq) +{ + // Attempt the steal, can fail beacuse it's a race. + if (!scx_bpf_dsq_move_to_local(candidate_dsq)) + return false; + + // We got the task! + task_ctx->steal_count++; + task_ctx->last_stolen_at = scx_bpf_now(); + /* Retag to thief L3 (the one for this cpu) */ + task_ctx->pending_l3 = local_l3; + task_ctx->steals_prevented = 0; + + /* Increment steal counter in map */ + u32 key = 0; + u64 *count = bpf_map_lookup_elem(&steal_stats, &key); + // NOTE: This could get expensive, but I'm not anticipating that many steals. Percpu if we care. + if (count) + __sync_fetch_and_add(count, 1); + + return true; +} + +/* Work stealing: + * Scan sibling (cell,L3) DSQs in the same cell and steal the first queued task if it can run on this cpu +*/ +static inline bool try_stealing_work(u32 cell, s32 local_l3) +{ + if (!l3_is_valid(local_l3)) + scx_bpf_error("try_stealing_work: invalid local_l3"); + + struct cell *cell_ptr = lookup_cell(cell); + if (!cell_ptr) + scx_bpf_error("try_stealing_work: invalid cell"); + + // Loop over all other L3s, looking for a queued task to steal + u32 i; + bpf_for(i, 1, nr_l3) + { + // Start with the next one to spread out the load + u32 candidate_l3 = (local_l3 + i) % nr_l3; + + // Prevents the optimizer from removing the following conditional return + // so that the verifier knows the read wil be safe + barrier_var(candidate_l3); + + if (candidate_l3 >= MAX_L3S) + continue; + + // Skip L3s that are not present in this cell + // Note: rechecking cell_ptr for verifier + // TODO: Lock? + if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) + continue; + + u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3).raw; + + struct task_struct *task = NULL; + struct task_ctx *task_ctx; + // I'm only using this for the verifier + bool found_task = false; + + // Optimization: skip if faster than constructing an iterator + // Not redundant with later checking if task found (race) + if (scx_bpf_dsq_nr_queued(candidate_dsq)) + continue; + + // Just a trick for peeking the head element + bpf_for_each(scx_dsq, task, candidate_dsq, 0) + { + task_ctx = lookup_task_ctx(task); + found_task = (task_ctx != NULL); + break; + } + + // No task? Try next L3 + if (!found_task) + continue; + + // This knob throttles stealing. 
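+		// e.g. PREVENT_N_STEALS == 2 skips a given candidate twice before
+		// allowing a steal attempt; the default of 0 above disables throttling.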
+ // TODO: make runtime configurable + if (task_ctx->steals_prevented++ < PREVENT_N_STEALS) { + continue; + } + + if (!try_stealing_this_task(task_ctx, local_l3, candidate_dsq)) + continue; + + // Success, we got a task (no guarantee it was the one we peeked though... race) + return true; + } + return false; +} +#endif +# File: scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * scx_mitosis is a dynamic affinity scheduler. Cgroups (and their tasks) are + * assigned to Cells which are affinitized to discrete sets of CPUs. The number + * of cells is dynamic, as is cgroup to cell assignment and cell to CPU + * assignment (all are determined by userspace). + * + * Each cell has an associated DSQ which it uses for vtime scheduling of the + * cgroups belonging to the cell. + */ + +// TODO: fix debug printer. +#include "intf.h" + +#include "mitosis.bpf.h" +#include "dsq.bpf.h" +#include "l3_aware.bpf.h" + +char _license[] SEC("license") = "GPL"; + +/* + * Variables populated by userspace + */ +const volatile u32 nr_possible_cpus = 1; +const volatile bool smt_enabled = true; +const volatile unsigned char all_cpus[MAX_CPUS_U8]; + +const volatile u64 slice_ns; +const volatile u64 root_cgid = 1; + +const volatile u32 nr_l3 = 1; +/* + * CPU assignment changes aren't fully in effect until a subsequent tick() + * configuration_seq is bumped on each assignment change + * applied_configuration_seq is bumped when the effect is fully applied + */ +u32 configuration_seq; +u32 applied_configuration_seq; + +private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask; +private(root_cgrp) struct cgroup __kptr *root_cgrp; + +UEI_DEFINE(uei); + +// Cells now defined as a map so we can lock. 
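+// Userspace reads entries via bpf() map lookups (see get_bpf_cell() in main.rs)
+// rather than through the global BSS array this replaced.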
+struct cell_map cells SEC(".maps"); + +/* + * Maps used for L3-aware scheduling +*/ +#if 0 +struct cell_locks_map cell_locks SEC(".maps"); +#endif +struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); +struct l3_to_cpus_map l3_to_cpus SEC(".maps"); + +/* + * Maps for statistics +*/ +struct function_counters_map function_counters SEC(".maps"); +struct steal_stats_map steal_stats SEC(".maps"); + +static inline void increment_counter(enum fn_counter_idx idx) { + u64 *counter; + u32 key = idx; + + counter = bpf_map_lookup_elem(&function_counters, &key); + if (counter) + (*counter)++; +} + +static inline struct cgroup *lookup_cgrp_ancestor(struct cgroup *cgrp, + u32 ancestor) +{ + struct cgroup *cg; + + if (!(cg = bpf_cgroup_ancestor(cgrp, ancestor))) { + scx_bpf_error("Failed to get ancestor level %d for cgid %llu", + ancestor, cgrp->kn->id); + return NULL; + } + + return cg; +} + +struct { + __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct cgrp_ctx); +} cgrp_ctxs SEC(".maps"); + +static inline struct cgrp_ctx *lookup_cgrp_ctx_fallible(struct cgroup *cgrp) +{ + struct cgrp_ctx *cgc; + + if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, 0))) { + return NULL; + } + + return cgc; +} + +static inline struct cgrp_ctx *lookup_cgrp_ctx(struct cgroup *cgrp) +{ + struct cgrp_ctx *cgc = lookup_cgrp_ctx_fallible(cgrp); + + if (!cgc) + scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", + cgrp->kn->id); + + return cgc; +} + +static inline struct cgroup *task_cgroup(struct task_struct *p) +{ + struct cgroup *cgrp = __COMPAT_scx_bpf_task_cgroup(p); + if (!cgrp) { + scx_bpf_error("Failed to get cgroup for task %d", p->pid); + } + return cgrp; +} + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctxs SEC(".maps"); + +static inline struct task_ctx *lookup_task_ctx(struct task_struct *p) +{ + struct task_ctx *tctx; + + if ((tctx = bpf_task_storage_get(&task_ctxs, p, 0, 0))) { + return tctx; + } + + scx_bpf_error("task_ctx lookup failed"); + return NULL; +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct cpu_ctx); + __uint(max_entries, 1); +} cpu_ctxs SEC(".maps"); + +static inline struct cpu_ctx *lookup_cpu_ctx(int cpu) +{ + struct cpu_ctx *cctx; + u32 zero = 0; + + if (cpu < 0) + cctx = bpf_map_lookup_elem(&cpu_ctxs, &zero); + else + cctx = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero, cpu); + + if (!cctx) { + scx_bpf_error("no cpu_ctx for cpu %d", cpu); + return NULL; + } + + return cctx; +} + + + +/* + * Cells are allocated concurrently in some cases (e.g. cgroup_init). + * allocate_cell and free_cell enable these allocations to be done safely + */ +static inline int allocate_cell() +{ + int cell_idx; + bpf_for(cell_idx, 0, MAX_CELLS) + { + struct cell *c; + if (!(c = lookup_cell(cell_idx))) + return -1; + + if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) { + // TODO XXX, I think we need to make this concurrent safe + // TODO, lock with recalc_cell...() + __builtin_memset(c->l3_cpu_cnt, 0, sizeof(c->l3_cpu_cnt)); + c->l3_present_cnt = 0; + // TODO zero cpu_cnt + // TODO Just zero the whole cell struct? 
+ return cell_idx; + } + } + scx_bpf_error("No available cells to allocate"); + return -1; +} + +static inline int free_cell(int cell_idx) +{ + struct cell *c; + + if (cell_idx < 0 || cell_idx >= MAX_CELLS) { + scx_bpf_error("Invalid cell %d", cell_idx); + return -1; + } + + if (!(c = lookup_cell(cell_idx))) + return -1; + + WRITE_ONCE(c->in_use, 0); + return 0; +} + +/* + * Store the cpumask for each cell (owned by BPF logic). We need this in an + * explicit map to allow for these to be kptrs. + */ +struct cell_cpumask_wrapper { + struct bpf_cpumask __kptr *cpumask; + /* + * To avoid allocation on the reconfiguration path, have a second cpumask we + * can just do an xchg on. + */ + struct bpf_cpumask __kptr *tmp_cpumask; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cell_cpumask_wrapper); + __uint(max_entries, MAX_CELLS); + __uint(map_flags, 0); +} cell_cpumasks SEC(".maps"); + +static inline const struct cpumask *lookup_cell_cpumask(int idx) +{ + struct cell_cpumask_wrapper *cpumaskw; + + if (!(cpumaskw = bpf_map_lookup_elem(&cell_cpumasks, &idx))) { + scx_bpf_error("no cell cpumask"); + return NULL; + } + + return (const struct cpumask *)cpumaskw->cpumask; +} + +/* + * Helper functions for bumping per-cell stats + */ +static void cstat_add(enum cell_stat_idx idx, u32 cell, struct cpu_ctx *cctx, + s64 delta) +{ + u64 *vptr; + + if ((vptr = MEMBER_VPTR(*cctx, .cstats[cell][idx]))) + (*vptr) += delta; + else + scx_bpf_error("invalid cell or stat idxs: %d, %d", idx, cell); +} + +static void cstat_inc(enum cell_stat_idx idx, u32 cell, struct cpu_ctx *cctx) +{ + cstat_add(idx, cell, cctx, 1); +} + +static inline int update_task_cpumask(struct task_struct *p, + struct task_ctx *tctx) +{ + const struct cpumask *cell_cpumask; + struct cpu_ctx *cpu_ctx; + u32 cpu; + + if (!(cell_cpumask = lookup_cell_cpumask(tctx->cell))) + return -ENOENT; + + if (!tctx->cpumask) + return -EINVAL; + + /* + * Calculate the intersection of CPUs that are both: + * 1. In this task's assigned cell (cell_cpumask) + * 2. Allowed by the task's CPU affinity (p->cpus_ptr) + * Store result in tctx->cpumask - this becomes the effective CPU set + * where this task can actually run. + */ + bpf_cpumask_and(tctx->cpumask, cell_cpumask, p->cpus_ptr); + + /* + * Check if the task can run on ALL CPUs in its assigned cell. + * If cell_cpumask is a subset of p->cpus_ptr, it means the task's + * CPU affinity doesn't restrict it within the cell - it can use + * any CPU in the cell. This affects scheduling decisions later. + * True if all the bits in cell_cpumask are set in p->cpus_ptr. + */ + tctx->all_cell_cpus_allowed = + bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); + + /* + * XXX - To be correct, we'd need to calculate the vtime + * delta in the previous dsq, scale it by the load + * fraction difference and then offset from the new + * dsq's vtime_now. For now, just do the simple thing + * and assume the offset to be zero. + * + * Revisit if high frequency dynamic cell switching + * needs to be supported. + */ + + // We want to set the task vtime to that of the cell it's joining. 
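+	// For cell-schedulable tasks that is the per-(cell, L3) vtime just below;
+	// CPU-pinned tasks instead take the per-CPU vtime in the else branch.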
+ if (tctx->all_cell_cpus_allowed) { + + const struct cpumask *l3_mask = NULL; + if (tctx->l3 != L3_INVALID) { + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + /* If the L3 no longer intersects the cell's cpumask, invalidate it */ + if (!l3_mask || !bpf_cpumask_intersects(cell_cpumask, l3_mask)) + tctx->l3 = L3_INVALID; + } + + /* --- Pick a new L3 if needed --- */ + if (tctx->l3 == L3_INVALID) { + s32 new_l3 = pick_l3_for_task(tctx->cell); + if (new_l3 < 0) { + scx_bpf_error("bad L3: %d", new_l3); + return -ENODEV; + } + tctx->l3 = new_l3; + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + if (!l3_mask) + return -ENOENT; + } + + /* --- Narrow the effective cpumask by the chosen L3 --- */ + /* tctx->cpumask already contains (task_affinity ∧ cell_mask) */ + if (tctx->cpumask) + bpf_cpumask_and(tctx->cpumask, (const struct cpumask *)tctx->cpumask, l3_mask); + + /* If empty after intersection, nothing can run here */ + if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { + scx_bpf_error("Empty cpumask after intersection"); + return -ENODEV; + } + + /* --- Point to the correct (cell,L3) DSQ and set vtime baseline --- */ + tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); + + struct cell *cell = lookup_cell(tctx->cell); + if (!cell) + return -ENOENT; + + if (!l3_is_valid(tctx->l3)){ + scx_bpf_error("Invalid L3 %d", tctx->l3); + return -EINVAL; + } + + p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); + } else { + /* Task is CPU-restricted, use task mask */ + cpu = bpf_cpumask_any_distribute(p->cpus_ptr); + if (!(cpu_ctx = lookup_cpu_ctx(cpu))) + return -ENOENT; + tctx->dsq = get_cpu_dsq_id(cpu); + p->scx.dsq_vtime = READ_ONCE(cpu_ctx->vtime_now); + } + + return 0; +} + +/* + * Figure out the task's cell, dsq and store the corresponding cpumask in the + * task_ctx. + */ +static inline int update_task_cell(struct task_struct *p, struct task_ctx *tctx, + struct cgroup *cg) +{ + struct cgrp_ctx *cgc; + + if (!(cgc = lookup_cgrp_ctx(cg))) + return -ENOENT; + + /* + * This ordering is pretty important, we read applied_configuration_seq + * before reading everything else expecting that the updater will update + * everything and then bump applied_configuration_seq last. This ensures + * that we cannot miss an update. + */ + tctx->configuration_seq = READ_ONCE(applied_configuration_seq); + barrier(); + tctx->cell = cgc->cell; + + return update_task_cpumask(p, tctx); +} + +/* Helper function for picking an idle cpu out of a candidate set */ +static s32 pick_idle_cpu_from(struct task_struct *p, + const struct cpumask *cand_cpumask, s32 prev_cpu, + const struct cpumask *idle_smtmask) +{ + bool prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask); + s32 cpu; + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. 
+ */ + if (smt_enabled) { + if (prev_in_cand && + bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) + return prev_cpu; + + cpu = scx_bpf_pick_idle_cpu(cand_cpumask, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + return cpu; + } + + if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) + return prev_cpu; + + return scx_bpf_pick_idle_cpu(cand_cpumask, 0); +} + +/* Check if we need to update the cell/cpumask mapping */ +static __always_inline int maybe_refresh_cell(struct task_struct *p, + struct task_ctx *tctx) +{ + struct cgroup *cgrp; + int ret = 0; + if (tctx->configuration_seq != READ_ONCE(applied_configuration_seq)) { + if (!(cgrp = task_cgroup(p))) + return -1; + if (update_task_cell(p, tctx, cgrp)) + ret = -1; + bpf_cgroup_release(cgrp); + } + return ret; +} + +static __always_inline s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, + struct cpu_ctx *cctx, + struct task_ctx *tctx) +{ + struct cpumask *task_cpumask; + const struct cpumask *idle_smtmask; + s32 cpu; + + if (!(task_cpumask = (struct cpumask *)tctx->cpumask) || + !(idle_smtmask = scx_bpf_get_idle_smtmask())) { + scx_bpf_error("Failed to get task cpumask or idle smtmask"); + return -1; + } + + /* No overlap between cell cpus and task cpus, just find some idle cpu */ + if (bpf_cpumask_empty(task_cpumask)) { + cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); + cpu = pick_idle_cpu_from(p, p->cpus_ptr, prev_cpu, + idle_smtmask); + goto out; + } + + cpu = pick_idle_cpu_from(p, task_cpumask, prev_cpu, idle_smtmask); +out: + scx_bpf_put_idle_cpumask(idle_smtmask); + return cpu; +} + +/* + * select_cpu is where we update each task's cell assignment and then try to + * dispatch to an idle core in the cell if possible + */ +s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) +{ + s32 cpu; + struct cpu_ctx *cctx; + struct task_ctx *tctx; + + increment_counter(COUNTER_SELECT_CPU); + + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) + return prev_cpu; + + if (maybe_refresh_cell(p, tctx) < 0) + return prev_cpu; + + /* Pinned path: only if our task really requires a per-CPU queue. */ + if (!tctx->all_cell_cpus_allowed) { + cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); + cpu = get_cpu_from_dsq(tctx->dsq); + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); + return cpu; + } + + // Grab an idle core + if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) { + cstat_inc(CSTAT_LOCAL, tctx->cell, cctx); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); + return cpu; + } + + if (!tctx->cpumask) { + scx_bpf_error("tctx->cpumask should never be NULL"); + return prev_cpu; + } + /* + * All else failed, send it to the prev cpu (if that's valid), otherwise any + * valid cpu. 
+ */ + if (!bpf_cpumask_test_cpu(prev_cpu, cast_mask(tctx->cpumask)) && + tctx->cpumask) + cpu = bpf_cpumask_any_distribute(cast_mask(tctx->cpumask)); + else + cpu = prev_cpu; + + return cpu; +} + +void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + struct cell *cell; + s32 task_cpu = scx_bpf_task_cpu(p); + u64 vtime = p->scx.dsq_vtime; + s32 cpu = -1; + u64 basis_vtime; + + increment_counter(COUNTER_ENQUEUE); + + if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1))) + return; + + if (maybe_refresh_cell(p, tctx) < 0) + return; + + // Cpu pinned work + if (!tctx->all_cell_cpus_allowed) { + cpu = get_cpu_from_dsq(tctx->dsq); + } else if (!__COMPAT_is_enq_cpu_selected(enq_flags)) { + /* + * If we haven't selected a cpu, then we haven't looked for and kicked an + * idle CPU. Let's do the lookup now and kick at the end. + */ + if (!(cctx = lookup_cpu_ctx(-1))) + return; + cpu = pick_idle_cpu(p, task_cpu, cctx, tctx); + if (cpu == -1) + return; + if (cpu == -EBUSY) { + /* + * Verifier gets unhappy claiming two different pointer types for + * the same instruction here. This fixes it + */ + barrier_var(tctx); + if (tctx->cpumask) + cpu = bpf_cpumask_any_distribute( + (const struct cpumask *)tctx->cpumask); + } + } + + if (tctx->all_cell_cpus_allowed) { + // This is a task that can run on any cpu in the cell + + cstat_inc(CSTAT_CELL_DSQ, tctx->cell, cctx); + + /* Task can use any CPU in its cell, set basis_vtime from per-(cell, L3) vtime */ + if (!(cell = lookup_cell(tctx->cell))) + return; + + if (!l3_is_valid(tctx->l3)) { + scx_bpf_error("Invalid L3 ID for task %d in enqueue", p->pid); + return; + } + basis_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); + + } else { + // This is a task that can only run on a specific cpu + cstat_inc(CSTAT_CPU_DSQ, tctx->cell, cctx); + + /* + * cctx is the local core cpu (where enqueue is running), not the core + * the task belongs to. Fetch the right cctx + */ + if (!(cctx = lookup_cpu_ctx(cpu))) + return; + /* Task is pinned to specific CPUs, use per-CPU DSQ */ + basis_vtime = READ_ONCE(cctx->vtime_now); + } + + tctx->basis_vtime = basis_vtime; + + if (time_after(vtime, + basis_vtime + VTIME_MAX_FUTURE_MULTIPLIER * slice_ns)) { + scx_bpf_error("vtime is too far in the future for %d", p->pid); + return; + } + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + // TODO: Should this be time_before64? + if (time_before(vtime, basis_vtime - slice_ns)) + vtime = basis_vtime - slice_ns; + + scx_bpf_dsq_insert_vtime(p, tctx->dsq.raw, slice_ns, vtime, enq_flags); + + /* Kick the CPU if needed */ + if (!__COMPAT_is_enq_cpu_selected(enq_flags) && cpu >= 0) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); +} + +void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) +{ + struct cpu_ctx *cctx; + u32 cell; + + increment_counter(COUNTER_DISPATCH); + + if (!(cctx = lookup_cpu_ctx(-1))) + return; + + cell = READ_ONCE(cctx->cell); + + /* Start from a valid DSQ */ + dsq_id_t local_dsq = get_cpu_dsq_id(cpu); + + bool found = false; + dsq_id_t min_vtime_dsq = local_dsq; + u64 min_vtime = ~0ULL; /* U64_MAX */ + struct task_struct *p; + + // Get L3 + u32 cpu_key = (u32)cpu; + u32 *l3_ptr = bpf_map_lookup_elem(&cpu_to_l3, &cpu_key); + s32 l3 = l3_ptr ? 
(s32)*l3_ptr : L3_INVALID; + + /* Check the L3 queue */ + if (l3 != L3_INVALID) { + dsq_id_t cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); + bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) { + min_vtime = p->scx.dsq_vtime; + min_vtime_dsq = cell_l3_dsq; + found = true; + break; + } + } + + /* Check the CPU DSQ for a lower vtime */ + bpf_for_each(scx_dsq, p, local_dsq.raw, 0) { + if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { + min_vtime = p->scx.dsq_vtime; + min_vtime_dsq = local_dsq; + found = true; + } + break; + } + + /* + * The move_to_local can fail if we raced with some other cpu in the cell + * and now the cell is empty. We have to ensure to try the cpu_dsq or else + * we might never wakeup. + */ + + + if (found) { + // We found a task in the local or cell-L3 DSQ + + // If it was in the per cpu DSQ, there is no competation, grab it and return + if (min_vtime_dsq.raw == local_dsq.raw) { + scx_bpf_dsq_move_to_local(min_vtime_dsq.raw); + return; + } + + // If it was in the cell L3 DSQ, we are competing with other cpus in the cell-l3 + // try to move it to the local DSQ + if (scx_bpf_dsq_move_to_local(min_vtime_dsq.raw)) { + // We won the race and got the task, return + return; + } + } + +#if MITOSIS_ENABLE_STEALING + // We didn't find a task in either DSQ, or lost the race. + // Instead of going straight to idle, attempt to steal a task from another + // L3 in the cell. + + // Try stealing. If successful, this moves the task to the local runqueue + try_stealing_work(cell, l3); +#endif +} + +struct cpumask_entry { + unsigned long cpumask[CPUMASK_LONG_ENTRIES]; + u64 used; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct cpumask_entry); + __uint(max_entries, MAX_CPUMASK_ENTRIES); +} cgrp_init_percpu_cpumask SEC(".maps"); + +static inline struct cpumask_entry *allocate_cpumask_entry() +{ + int cpumask_idx; + bpf_for(cpumask_idx, 0, MAX_CPUMASK_ENTRIES) + { + struct cpumask_entry *ent = bpf_map_lookup_elem( + &cgrp_init_percpu_cpumask, &cpumask_idx); + if (!ent) { + scx_bpf_error("Failed to fetch cpumask_entry"); + return NULL; + } + if (__sync_bool_compare_and_swap(&ent->used, 0, 1)) + return ent; + } + scx_bpf_error("All cpumask entries are in use"); + return NULL; +} + +static inline void free_cpumask_entry(struct cpumask_entry *entry) +{ + WRITE_ONCE(entry->used, 0); +} + +/* For use by cleanup attribute */ +static inline void __free_cpumask_entry(struct cpumask_entry **entry) +{ + if (entry) + if (*entry) + free_cpumask_entry(*entry); +} + +#define DECLARE_CPUMASK_ENTRY(var) \ + struct cpumask_entry *var __attribute__((cleanup(__free_cpumask_entry))) + +/* Define types for cpumasks in-situ vs as a ptr in struct cpuset */ +struct cpumask___local {}; + +typedef struct cpumask___local *cpumask_var_t___ptr; + +struct cpuset___cpumask_ptr { + cpumask_var_t___ptr cpus_allowed; +}; + +typedef struct cpumask___local cpumask_var_t___arr[1]; + +struct cpuset___cpumask_arr { + cpumask_var_t___arr cpus_allowed; +}; + +/* + * Given a cgroup, get its cpumask (populated in entry), returns 0 if no + * cpumask, < 0 on error and > 0 on a populated cpumask. 
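+ * (0 is also returned when the cpuset covers every CPU in all_cpumask,
+ * i.e. it does not actually restrict the cgroup.)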
+ */ +static inline int get_cgroup_cpumask(struct cgroup *cgrp, + struct cpumask_entry *entry) +{ + if (!cgrp->subsys[cpuset_cgrp_id]) + return 0; + + struct cpuset *cpuset = + container_of(cgrp->subsys[cpuset_cgrp_id], struct cpuset, css); + + if (!cpuset) + return 0; + + unsigned long runtime_cpumask_size = bpf_core_type_size(struct cpumask); + if (runtime_cpumask_size > CPUMASK_SIZE) { + scx_bpf_error( + "Definition of struct cpumask is too large. Please increase CPUMASK_LONG_ENTRIES"); + return -EINVAL; + } + + int err; + if (bpf_core_type_matches(struct cpuset___cpumask_arr)) { + struct cpuset___cpumask_arr *cpuset_typed = + (void *)bpf_core_cast(cpuset, struct cpuset); + err = bpf_core_read(&entry->cpumask, runtime_cpumask_size, + &cpuset_typed->cpus_allowed); + } else if (bpf_core_type_matches(struct cpuset___cpumask_ptr)) { + struct cpuset___cpumask_ptr *cpuset_typed = + (void *)bpf_core_cast(cpuset, struct cpuset); + err = bpf_core_read(&entry->cpumask, runtime_cpumask_size, + cpuset_typed->cpus_allowed); + } else { + scx_bpf_error( + "Definition of struct cpuset did not match any expected struct"); + return -EINVAL; + } + + if (err < 0) { + scx_bpf_error( + "bpf_core_read of cpuset->cpus_allowed failed for cgid %llu", + cgrp->kn->id); + return err; + } + + if (bpf_cpumask_empty((const struct cpumask *)&entry->cpumask)) + return 0; + + if (!all_cpumask) { + scx_bpf_error("all_cpumask should not be NULL"); + return -EINVAL; + } + + if (bpf_cpumask_subset((const struct cpumask *)all_cpumask, + (const struct cpumask *)&entry->cpumask)) + return 0; + + return 1; +} + +/* + * This array keeps track of the cgroup ancestor's cell as we iterate over the + * cgroup hierarchy. + */ +u32 level_cells[MAX_CG_DEPTH]; +int running; + +/* The guard is a stack variable. When it falls out of scope, + * we drop the running lock. 
*/ +static inline void __running_unlock(int *guard) { + (void)guard; /* unused */ + WRITE_ONCE(running, 0); +} + +/* + * On tick, we identify new cells and apply CPU assignment + */ +void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) +{ + + u32 local_configuration_seq = READ_ONCE(configuration_seq); + if (local_configuration_seq == READ_ONCE(applied_configuration_seq)) + return; + + int zero = 0; + if (!__atomic_compare_exchange_n(&running, &zero, 1, false, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + return; + + int __attribute__((cleanup(__running_unlock), unused)) __running_guard; + + DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); + if (!entry) + return; + + /* Get the root cell (cell 0) and its cpumask */ + struct cell_cpumask_wrapper *root_cell_cpumaskw; + if (!(root_cell_cpumaskw = + bpf_map_lookup_elem(&cell_cpumasks, &zero))) { + scx_bpf_error("Failed to find root cell cpumask"); + return; + } + + struct bpf_cpumask *root_bpf_cpumask; + root_bpf_cpumask = + bpf_kptr_xchg(&root_cell_cpumaskw->tmp_cpumask, NULL); + if (!root_bpf_cpumask) { + scx_bpf_error("tmp_cpumask should never be null"); + return; + } + if (!root_cell_cpumaskw->cpumask) { + scx_bpf_error("root cpumasks should never be null"); + goto out; + } + + if (!all_cpumask) { + scx_bpf_error("NULL all_cpumask"); + goto out; + } + + /* + * Initialize root cell cpumask to all cpus, and then remove from it as we go + */ + bpf_cpumask_copy(root_bpf_cpumask, (const struct cpumask *)all_cpumask); + + struct cgroup_subsys_state *root_css, *pos; + struct cgroup *cur_cgrp, *root_cgrp_ref; + + if (!root_cgrp) { + scx_bpf_error("root_cgrp should not be null"); + goto out; + } + + struct cgrp_ctx *root_cgrp_ctx; + if (!(root_cgrp_ctx = lookup_cgrp_ctx(root_cgrp))) + goto out; + + if (!root_cgrp) { + scx_bpf_error("root_cgrp should not be null"); + goto out; + } + + if (!(root_cgrp_ref = bpf_cgroup_acquire(root_cgrp))) { + scx_bpf_error("Failed to acquire reference to root_cgrp"); + goto out; + } + root_css = &root_cgrp_ref->self; + + bpf_rcu_read_lock(); + /* + * Iterate over all cgroups, check if any have a cpumask and populate them + * as a separate cell. + */ + bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) { + cur_cgrp = pos->cgroup; + + /* + * We can iterate over dying cgroups, in which case this lookup will + * fail. These cgroups can't have tasks in them so just continue. + */ + struct cgrp_ctx *cgrp_ctx; + if (!(cgrp_ctx = lookup_cgrp_ctx_fallible(cur_cgrp))) + continue; + + int rc = get_cgroup_cpumask(cur_cgrp, entry); + if (!rc) { + /* + * TODO: If this was a cell owner that just had its cpuset removed, + * it should free the cell. Doing so would require draining + * in-flight tasks scheduled to the dsq. + */ + /* No cpuset, assign to parent cell and continue */ + if (cur_cgrp->kn->id != root_cgid) { + u32 level = cur_cgrp->level; + if (level <= 0 || level >= MAX_CG_DEPTH) { + scx_bpf_error( + "Cgroup hierarchy is too deep: %d", + level); + goto out_rcu_unlock; + } + /* + * This is a janky way of getting the parent cell, ideally we'd + * lookup the parent cgrp_ctx and get it that way, but some + * cgroup lookups don't work here because they are (erroneously) + * only operating on the cgroup namespace of current. Given this + * is a tick() it could be anything. See + * https://lore.kernel.org/bpf/20250811175045.1055202-1-memxor@gmail.com/ + * for details. + * + * Instead, we just track the parent cells as we walk the cgroup + * hierarchy in a separate array. 
Because the iteration is + * pre-order traversal, we're guaranteed to have the current + * cgroup's ancestor's cells in level_cells. + */ + u32 parent_cell = level_cells[level - 1]; + WRITE_ONCE(cgrp_ctx->cell, parent_cell); + level_cells[level] = parent_cell; + } + continue; + } else if (rc < 0) + goto out_rcu_unlock; + + /* + * cgroup has a cpumask, allocate a new cell if needed, and assign cpus + */ + int cell_idx = READ_ONCE(cgrp_ctx->cell); + if (!cgrp_ctx->cell_owner) { + cell_idx = allocate_cell(); + if (cell_idx < 0) + goto out_rcu_unlock; + cgrp_ctx->cell_owner = true; + } + + struct cell_cpumask_wrapper *cell_cpumaskw; + if (!(cell_cpumaskw = + bpf_map_lookup_elem(&cell_cpumasks, &cell_idx))) { + scx_bpf_error("Failed to find cell cpumask: %d", + cell_idx); + goto out_rcu_unlock; + } + + struct bpf_cpumask *bpf_cpumask; + bpf_cpumask = bpf_kptr_xchg(&cell_cpumaskw->tmp_cpumask, NULL); + if (!bpf_cpumask) { + scx_bpf_error("tmp_cpumask should never be null"); + goto out_rcu_unlock; + } + bpf_cpumask_copy(bpf_cpumask, + (const struct cpumask *)&entry->cpumask); + int cpu_idx; + bpf_for(cpu_idx, 0, nr_possible_cpus) + { + if (bpf_cpumask_test_cpu( + cpu_idx, + (const struct cpumask *)&entry->cpumask)) { + struct cpu_ctx *cpu_ctx; + if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) { + bpf_cpumask_release(bpf_cpumask); + goto out_rcu_unlock; + } + cpu_ctx->cell = cell_idx; + bpf_cpumask_clear_cpu(cpu_idx, + root_bpf_cpumask); + } + } + bpf_cpumask = + bpf_kptr_xchg(&cell_cpumaskw->cpumask, bpf_cpumask); + if (!bpf_cpumask) { + scx_bpf_error("cpumask should never be null"); + goto out_rcu_unlock; + } + + bpf_cpumask = + bpf_kptr_xchg(&cell_cpumaskw->tmp_cpumask, bpf_cpumask); + if (bpf_cpumask) { + scx_bpf_error("tmp_cpumask should be null"); + bpf_cpumask_release(bpf_cpumask); + goto out_rcu_unlock; + } + + barrier(); + WRITE_ONCE(cgrp_ctx->cell, cell_idx); + u32 level = cur_cgrp->level; + if (level <= 0 || level >= MAX_CG_DEPTH) { + scx_bpf_error("Cgroup hierarchy is too deep: %d", + level); + goto out_rcu_unlock; + } + level_cells[level] = cell_idx; + } + bpf_rcu_read_unlock(); + + /* + * assign root cell cpus that are left over + */ + int cpu_idx; + bpf_for(cpu_idx, 0, nr_possible_cpus) + { + if (bpf_cpumask_test_cpu( cpu_idx, (const struct cpumask *)root_bpf_cpumask)) { + struct cpu_ctx *cpu_ctx; + if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) + goto out_root_cgrp; + cpu_ctx->cell = 0; + } + } + + root_bpf_cpumask = + bpf_kptr_xchg(&root_cell_cpumaskw->cpumask, root_bpf_cpumask); + if (!root_bpf_cpumask) { + scx_bpf_error("root cpumask should never be null"); + bpf_cgroup_release(root_cgrp_ref); + return; + } + + root_bpf_cpumask = bpf_kptr_xchg(&root_cell_cpumaskw->tmp_cpumask, + root_bpf_cpumask); + if (root_bpf_cpumask) { + scx_bpf_error("root tmp_cpumask should be null"); + goto out_root_cgrp; + } + + int cell_idx; + /* Recalculate L3 counts for all active cells after CPU assignment changes */ + bpf_for(cell_idx, 1, MAX_CELLS) { + struct cell *cell; + if (!(cell = lookup_cell(cell_idx))) { + scx_bpf_error("Lookup for cell %d failed in tick()", cell_idx); + goto out_root_cgrp; + } + + if (!cell->in_use) + continue; + + /* Recalculate L3 counts for each active cell */ + recalc_cell_l3_counts(cell_idx); + } + + /* Recalculate root cell's L3 counts after cpumask update */ + recalc_cell_l3_counts(ROOT_CELL_ID); + + barrier(); + WRITE_ONCE(applied_configuration_seq, local_configuration_seq); + + bpf_cgroup_release(root_cgrp_ref); + return; + +out_rcu_unlock: + bpf_rcu_read_unlock(); 
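+	/* fall through: the root cgroup reference is released below */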
+out_root_cgrp: + bpf_cgroup_release(root_cgrp_ref); +out: + if (root_bpf_cpumask) + bpf_cpumask_release(root_bpf_cpumask); +} + +void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + struct cell *cell; + + if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1)) || + !(cell = lookup_cell(cctx->cell))) + return; + + /* + * If this task was stolen across L3s, retag to thief L3 and recompute + * effective cpumask+DSQ. Preserve vtime to keep fairness. + */ +#if MITOSIS_ENABLE_STEALING + if (l3_is_valid(tctx->pending_l3)) { + u64 save_v = p->scx.dsq_vtime; + tctx->l3 = tctx->pending_l3; + tctx->pending_l3 = L3_INVALID; + update_task_cpumask(p, tctx); + p->scx.dsq_vtime = save_v; + } +#endif + + /* Validate task's DSQ before it starts running */ + if (tctx->dsq.raw == DSQ_INVALID) { + if (tctx->all_cell_cpus_allowed) { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CELL-SCHEDULABLE task, can run on any CPU in cell %d)", + p->pid, tctx->cell); + } else { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CORE-PINNED task, restricted to specific CPUs)", + p->pid); + } + return; + } + + /* + * Update per-(cell, L3) vtime for cell-schedulable tasks + */ + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { + if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), p->scx.dsq_vtime)) + WRITE_ONCE(cell->l3_vtime_now[tctx->l3], p->scx.dsq_vtime); + } + + /* + * Update CPU vtime for CPU-pinned tasks + */ + if (time_before(READ_ONCE(cctx->vtime_now), p->scx.dsq_vtime)) + WRITE_ONCE(cctx->vtime_now, p->scx.dsq_vtime); + + tctx->started_running_at = scx_bpf_now(); +} + +void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + struct cell *cell; + u64 now, used; + u32 cidx; + + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) + return; + + cidx = tctx->cell; + if (!(cell = lookup_cell(cidx))) + return; + + now = scx_bpf_now(); + used = now - tctx->started_running_at; + tctx->started_running_at = now; + /* scale the execution time by the inverse of the weight and charge */ + p->scx.dsq_vtime += used * DEFAULT_WEIGHT_MULTIPLIER / p->scx.weight; + + if (cidx != 0 || tctx->all_cell_cpus_allowed) { + u64 *cell_cycles = MEMBER_VPTR(cctx->cell_cycles, [cidx]); + if (!cell_cycles) { + scx_bpf_error("Cell index is too large: %d", cidx); + return; + } + *cell_cycles += used; + + /* + * For cell-schedulable tasks, also accumulate vtime into + * per-cell per-L3 queues + */ + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { + /* Accumulate weighted execution time into per-(cell, L3) vtime */ + cell->l3_vtime_now[tctx->l3] += + used * DEFAULT_WEIGHT_MULTIPLIER / + p->scx.weight; + } + } +} + +SEC("fentry/cpuset_write_resmask") +int BPF_PROG(fentry_cpuset_write_resmask, struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, ssize_t retval) +{ + /* + * On a write to cpuset.cpus, we'll need to configure new cells, bump + * configuration_seq so tick() does that. 
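+	 * cgroup_init() and cgroup_exit() bump it for the same reason when a
+	 * cpuset appears or a cell-owning cgroup goes away.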
+ */ + __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); + return 0; +} + +s32 BPF_STRUCT_OPS(mitosis_cgroup_init, struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + struct cgrp_ctx *cgc; + if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE))) { + scx_bpf_error("cgrp_ctx creation failed for cgid %llu", + cgrp->kn->id); + return -ENOENT; + } + + // Special case for root cell + if (cgrp->kn->id == root_cgid) { + WRITE_ONCE(cgc->cell, ROOT_CELL_ID); + return 0; + } + + DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); + if (!entry) + return -EINVAL; + int rc = get_cgroup_cpumask(cgrp, entry); + if (rc < 0) + return rc; + else if (rc > 0) { + /* + * This cgroup has a cpuset, bump configuration_seq so tick() + * configures it. + */ + __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); + } + + /* Initialize to parent's cell */ + struct cgroup *parent_cg; + if (!(parent_cg = lookup_cgrp_ancestor(cgrp, cgrp->level - 1))) + return -ENOENT; + + struct cgrp_ctx *parent_cgc; + if (!(parent_cgc = lookup_cgrp_ctx(parent_cg))) { + bpf_cgroup_release(parent_cg); + return -ENOENT; + } + + bpf_cgroup_release(parent_cg); + cgc->cell = parent_cgc->cell; + return 0; +} + +s32 BPF_STRUCT_OPS(mitosis_cgroup_exit, struct cgroup *cgrp) +{ + struct cgrp_ctx *cgc; + if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE))) { + scx_bpf_error("cgrp_ctx creation failed for cgid %llu", + cgrp->kn->id); + return -ENOENT; + } + + if (cgc->cell_owner) { + int ret; + if ((ret = free_cell(cgc->cell))) + return ret; + /* + * Need to make sure the cpus of this cell are freed back to the root + * cell and the root cell cpumask can be expanded. Bump + * configuration_seq so tick() does that. + */ + __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); + } + + return 0; +} + +void BPF_STRUCT_OPS(mitosis_cgroup_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + struct task_ctx *tctx; + + if (!(tctx = lookup_task_ctx(p))) + return; + + update_task_cell(p, tctx, to); +} + +void BPF_STRUCT_OPS(mitosis_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{ + struct task_ctx *tctx; + + if (!(tctx = lookup_task_ctx(p))) + return; + + if (!all_cpumask) { + scx_bpf_error("NULL all_cpumask"); + return; + } + + update_task_cpumask(p, tctx); +} + +s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct task_ctx *tctx; + struct bpf_cpumask *cpumask; + int ret; + + tctx = bpf_task_storage_get(&task_ctxs, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!tctx) { + scx_bpf_error("task_ctx allocation failure"); + return -ENOMEM; + } + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + cpumask = bpf_kptr_xchg(&tctx->cpumask, cpumask); + if (cpumask) { + /* Should never happen as we just inserted it above. 
*/ + bpf_cpumask_release(cpumask); + scx_bpf_error("tctx cpumask is unexpectedly populated on init"); + return -EINVAL; + } + + if (!all_cpumask) { + scx_bpf_error("missing all_cpumask"); + return -EINVAL; + } + + /* Initialize L3 to invalid before cell assignment */ + init_task_l3(tctx); + + // TODO clean this up + if ((ret = update_task_cell(p, tctx, args->cgroup))) { + return ret; + } + + return 0; +} + +__hidden void dump_cpumask_word(s32 word, const struct cpumask *cpumask) +{ + u32 u, v = 0; + + bpf_for(u, 0, BITS_PER_U32) + { + s32 cpu = BITS_PER_U32 * word + u; + if (cpu < nr_possible_cpus && + bpf_cpumask_test_cpu(cpu, cpumask)) + v |= 1 << u; + } + scx_bpf_dump("%08x", v); +} + +static void dump_cpumask(const struct cpumask *cpumask) +{ + u32 word, nr_words = (nr_possible_cpus + 31) / 32; + + bpf_for(word, 0, nr_words) + { + if (word) + scx_bpf_dump(","); + dump_cpumask_word(nr_words - word - 1, cpumask); + } +} + +static void dump_cell_cpumask(int id) +{ + const struct cpumask *cell_cpumask; + + if (!(cell_cpumask = lookup_cell_cpumask(id))) + return; + + dump_cpumask(cell_cpumask); +} + +/* Print cell state for debugging */ +static __always_inline void dump_cell_state(u32 cell_idx) +{ + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_dump("Cell %d: NOT FOUND", cell_idx); + return; + } + + scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", + cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); + + u32 l3; + // TODO Print vtimes for L3s + // TODO lock + bpf_for(l3, 0, nr_l3) { + if (cell->l3_cpu_cnt[l3] > 0) { + scx_bpf_dump(" L3[%d]: %d CPUs", l3, cell->l3_cpu_cnt[l3]); + } + } +} + +// TODO: FIX THIS +static __always_inline void dump_l3_state(){ +} + +void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) +{ + dsq_id_t dsq_id; + int i; + struct cell *cell; + struct cpu_ctx *cpu_ctx; + + scx_bpf_dump_header(); + + bpf_for(i, 0, MAX_CELLS) + { + if (!(cell = lookup_cell(i))) + return; + + if (!cell->in_use) + continue; + + scx_bpf_dump("CELL[%d] CPUS=", i); + dump_cell_cpumask(i); + scx_bpf_dump("\n"); + dump_cell_state(i); + } + + bpf_for(i, 0, nr_possible_cpus) + { + if (!(cpu_ctx = lookup_cpu_ctx(i))) + return; + + dsq_id = get_cpu_dsq_id(i); + scx_bpf_dump("CPU[%d] cell=%d vtime=%llu nr_queued=%d\n", i, + cpu_ctx->cell, READ_ONCE(cpu_ctx->vtime_now), + scx_bpf_dsq_nr_queued(dsq_id.raw)); + } + + dump_l3_state(); + +} + +void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, + struct task_struct *p) +{ + struct task_ctx *tctx; + + if (!(tctx = lookup_task_ctx(p))) + return; + + scx_bpf_dump( + "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%llu all_cell_cpus_allowed=%d\n", + p->pid, p->scx.dsq_vtime, tctx->basis_vtime, tctx->cell, + tctx->dsq.raw, tctx->all_cell_cpus_allowed); + scx_bpf_dump("Task[%d] CPUS=", p->pid); + dump_cpumask(p->cpus_ptr); + scx_bpf_dump("\n"); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) +{ + struct bpf_cpumask *cpumask; + u32 i; + s32 ret; + + struct cgroup *rootcg; + if (!(rootcg = bpf_cgroup_from_id(root_cgid))) + return -ENOENT; + + rootcg = bpf_kptr_xchg(&root_cgrp, rootcg); + if (rootcg) + bpf_cgroup_release(rootcg); + + /* setup all_cpumask */ + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + bpf_for(i, 0, nr_possible_cpus) + { + const volatile u8 *u8_ptr; + + if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { + if (*u8_ptr & (1 << (i % 8))) { + bpf_cpumask_set_cpu(i, cpumask); + ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, ANY_NUMA); + if (ret < 0) { + 
bpf_cpumask_release(cpumask); + return ret; + } + } + } else { + return -EINVAL; + } + } + + + cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + /* setup cell cpumasks */ + bpf_for(i, 0, MAX_CELLS) + { + struct cell_cpumask_wrapper *cpumaskw; + if (!(cpumaskw = bpf_map_lookup_elem(&cell_cpumasks, &i))) + return -ENOENT; + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + /* + * Start with all full cpumask for all cells. They'll get setup in + * cgroup_init + */ + bpf_cpumask_setall(cpumask); + + cpumask = bpf_kptr_xchg(&cpumaskw->cpumask, cpumask); + if (cpumask) { + /* Should be impossible, we just initialized the cell cpumask */ + bpf_cpumask_release(cpumask); + return -EINVAL; + } + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + cpumask = bpf_kptr_xchg(&cpumaskw->tmp_cpumask, cpumask); + if (cpumask) { + /* Should be impossible, we just initialized the cell tmp_cpumask */ + bpf_cpumask_release(cpumask); + return -EINVAL; + } + } + + // cells[0].in_use = true; + lookup_cell(0)->in_use = true; + + /* Configure root cell (cell 0) topology at init time using nr_l3 and l3_to_cpu masks */ + recalc_cell_l3_counts(ROOT_CELL_ID); + + /* Create (cell,L3) DSQs for all pairs. Userspace will populate maps. */ + // This is a crazy over-estimate + bpf_for(i, 0, MAX_CELLS) + { + u32 l3; + bpf_for(l3, 0, nr_l3) + { + ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, ANY_NUMA); + if (ret < 0) + scx_bpf_error( "Failed to create DSQ for cell %d, L3 %d: err %d", i, l3, ret); + } + } + + return 0; +} + +void BPF_STRUCT_OPS(mitosis_exit, struct scx_exit_info *ei) +{ + // int i; + // bpf_for(i, 0, MAX_CELLS); { + // dump_cell_state((u32)i); + // } + + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops mitosis = { + .select_cpu = (void *)mitosis_select_cpu, + .enqueue = (void *)mitosis_enqueue, + .dispatch = (void *)mitosis_dispatch, + .tick = (void *)mitosis_tick, + .running = (void *)mitosis_running, + .stopping = (void *)mitosis_stopping, + .set_cpumask = (void *)mitosis_set_cpumask, + .init_task = (void *)mitosis_init_task, + .cgroup_init = (void *)mitosis_cgroup_init, + .cgroup_exit = (void *)mitosis_cgroup_exit, + .cgroup_move = (void *)mitosis_cgroup_move, + .dump = (void *)mitosis_dump, + .dump_task = (void *)mitosis_dump_task, + .init = (void *)mitosis_init, + .exit = (void *)mitosis_exit, + .name = "mitosis", +}; +# File: scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This defines the core data structures, types, and constants + * for the scx_mitosis scheduler, primarily containing `struct cell` + * and `struct task_ctx`. + */ + +#pragma once + +#ifdef LSP +#define __bpf__ +#include "../../../../include/scx/common.bpf.h" +#include "../../../../include/scx/ravg_impl.bpf.h" +#else +#include +#include +#endif + +#include "intf.h" + +#define MAX_L3S 16 + +#include "dsq.bpf.h" + +/* + * A couple of tricky things about checking a cgroup's cpumask: + * + * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get + * this right now is to copy the cpumask to a map entry. Given that cgroup init + * could be re-entrant we have a few per-cpu entries in a map to make this + * doable. 
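+ * (See cgrp_init_percpu_cpumask and DECLARE_CPUMASK_ENTRY() in mitosis.bpf.c.)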
+ * + * Second, cpumask can sometimes be stored as an array in-situ or as a pointer + * and with different lengths. Some bpf_core_type_matches finagling can make + * this all work. + */ +#define MAX_CPUMASK_ENTRIES (4) + +/* + * We don't know how big struct cpumask is at compile time, so just allocate a + * large space and check that it is big enough at runtime + * TODO: This should be deduplicated with the rust code and put in intf.h + */ +#define CPUMASK_LONG_ENTRIES (128) +#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) + +extern const volatile u32 nr_l3; + +extern struct cell_map cells; + + +enum mitosis_constants { + + /* Root cell index */ + ROOT_CELL_ID = 0, + + /* Invalid/unset L3 value */ + // INVALID_L3_ID = -1, + + /* Default weight divisor for vtime calculation */ + DEFAULT_WEIGHT_MULTIPLIER = 100, + + /* Vtime validation multiplier (slice_ns * 8192) */ + VTIME_MAX_FUTURE_MULTIPLIER = 8192, + + /* Bits per u32 for cpumask operations */ + BITS_PER_U32 = 32, + + /* No NUMA constraint for DSQ creation */ + ANY_NUMA = -1, +}; + +struct cell { + struct bpf_spin_lock lock; + + // Whether or not the cell is used or not + u32 in_use; + // Number of CPUs in this cell + u32 cpu_cnt; + // per-L3 vtimes within this cell + u64 l3_vtime_now[MAX_L3S]; + // Number of CPUs from each L3 assigned to this cell + u32 l3_cpu_cnt[MAX_L3S]; + // Number of L3s with at least one CPU in this cell + u32 l3_present_cnt; + + // TODO XXX remove this, only here temporarily to make the code compile + // current vtime of the cell + u64 vtime_now; +}; + +// #if 0 +/* Wrap the spin lock in a struct for verifier */ +// struct cell_lock_wrapper { +// struct bpf_spin_lock lock; +// }; + +// struct cell_locks_map { +// __uint(type, BPF_MAP_TYPE_ARRAY); +// __type(key, u32); +// __type(value, struct cell_lock_wrapper); +// __uint(max_entries, MAX_CELLS); +// }; + +#define WITH_CELL_LOCK(cell_ptr, cell_idx, block) \ + do { \ + struct bpf_spin_lock *lock = get_cell_lock(cell_idx); \ + if (!lock) { \ + scx_bpf_error("Failed to get lock for cell %d", \ + cell_idx); \ + break; \ + } \ + bpf_spin_lock(lock); \ + block bpf_spin_unlock(lock); \ + } while (0) + +static inline struct cell *lookup_cell(int idx) +{ + struct cell *cell; + + // cell = MEMBER_VPTR(cells, [idx]); + cell = bpf_map_lookup_elem(&cells, &idx); + + + if (!cell) { + scx_bpf_error("Invalid cell %d", idx); + return NULL; + } + return cell; +} + +static inline struct bpf_spin_lock *get_cell_lock(u32 cell_idx) +{ + if (cell_idx >= MAX_CELLS) { + scx_bpf_error("Invalid cell index %d", cell_idx); + return NULL; + } + + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_error("Cell %d not found", cell_idx); + return NULL; + } + return &cell->lock; +} +// #endif + +/* + * task_ctx is the per-task information kept by scx_mitosis + */ +struct task_ctx { + /* cpumask is the set of valid cpus this task can schedule on */ + /* (tasks cpumask anded with its cell cpumask) */ + struct bpf_cpumask __kptr *cpumask; + /* started_running_at for recording runtime */ + u64 started_running_at; + u64 basis_vtime; + /* For the sake of monitoring, each task is owned by a cell */ + u32 cell; + /* For the sake of scheduling, a task is exclusively owned by either a cell + * or a cpu */ + dsq_id_t dsq; + /* latest configuration that was applied for this task */ + /* (to know if it has to be re-applied) */ + u32 configuration_seq; + /* Is this task allowed on all cores of its cell? 
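+	 * If not, the task is dispatched through a per-CPU DSQ rather than
+	 * its cell's (cell, L3) DSQ.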
*/ + bool all_cell_cpus_allowed; + // Which L3 this task is assigned to + s32 l3; + +#if MITOSIS_ENABLE_STEALING + /* When a task is stolen, dispatch() marks the destination L3 here. + * running() applies the retag and recomputes cpumask (vtime preserved). + */ + s32 pending_l3; + u32 steal_count; /* how many times this task has been stolen */ + u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ + u32 steals_prevented; /* how many times this task has been prevented from being stolen */ +#endif +}; + +// These could go in mitosis.bpf.h, but we'll cross that bridge when we get +static inline const struct cpumask *lookup_cell_cpumask(int idx); + +static inline struct task_ctx *lookup_task_ctx(struct task_struct *p); + +/* MAP TYPES */ +struct function_counters_map { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, NR_COUNTERS); +}; + +struct cell_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cell); + __uint(max_entries, MAX_CELLS); +}; + +struct rcu_read_guard { + bool active; +}; + +static inline struct rcu_read_guard rcu_read_lock_guard(void) +{ + bpf_rcu_read_lock(); + return (struct rcu_read_guard){ .active = true }; +} + +static inline void rcu_read_guard_release(struct rcu_read_guard *guard) +{ + if (guard->active) { + bpf_rcu_read_unlock(); + guard->active = false; + } +} +#define RCU_READ_GUARD() \ + struct rcu_read_guard __rcu_guard \ + __attribute__((__cleanup__(rcu_read_guard_release))) = \ + rcu_read_lock_guard() + +struct cpumask_guard { + struct bpf_cpumask *mask; +}; + +static inline struct cpumask_guard cpumask_create_guard(void) +{ + struct bpf_cpumask *mask = bpf_cpumask_create(); + return (struct cpumask_guard){ .mask = mask }; +} + +static inline void cpumask_guard_release(struct cpumask_guard *guard) +{ + if (guard->mask) { + bpf_cpumask_release(guard->mask); + guard->mask = NULL; + } +} + +#define CPUMASK_GUARD(var_name) \ + struct cpumask_guard var_name \ + __attribute__((__cleanup__(cpumask_guard_release))) = \ + cpumask_create_guard() diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 00045df399..130f4f2480 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -5,13 +5,13 @@ #ifndef __INTF_H #define __INTF_H -#ifndef __KERNEL__ +#ifndef __BPF__ +#include typedef unsigned long long u64; typedef unsigned int u32; typedef _Bool bool; #endif - #ifdef LSP #define __bpf__ #include "../../../../include/scx/ravg.bpf.h" @@ -49,24 +49,51 @@ enum consts { struct cell { // This is a lock in the kernel and padding in the user - CELL_LOCK_T lock; + CELL_LOCK_T lock; // Assumed to be the first entry (see below) // Whether or not the cell is used u32 in_use; + // Number of CPUs in this cell u32 cpu_cnt; - // per-L3 vtimes within this cell - u64 l3_vtime_now[MAX_L3S]; - // Number of CPUs from each L3 assigned to this cell - u32 l3_cpu_cnt[MAX_L3S]; + // Number of L3s with at least one CPU in this cell u32 l3_present_cnt; - // TODO XXX remove this, only here temporarily to make the code compile - // current vtime of the cell - u64 vtime_now; + // Number of CPUs from each L3 assigned to this cell + u32 l3_cpu_cnt[MAX_L3S]; + + // per-L3 vtimes within this cell + u64 l3_vtime_now[MAX_L3S]; }; +// Putting the lock first in the struct is our convention. +// We pad this space when in Rust code that will never see the lock value. 
+// We intentionally avoid it in copy_cell_no_lock to keep the verifier happy. +// It is a BPF constraint that it is 4 byte aligned. + +// All assertions work for both BPF and userspace builds +_Static_assert(offsetof(struct cell, lock) == 0, + "lock/padding must be first field"); + +_Static_assert(sizeof(((struct cell *)0)->lock) == 4, + "lock/padding must be 4 bytes"); + +_Static_assert(_Alignof(CELL_LOCK_T) == 4, + "lock/padding must be 4-byte aligned"); + +_Static_assert(offsetof(struct cell, in_use) == 4, + "in_use must follow 4-byte lock/padding"); + +// Verify these are the same size in both BPF and Rust. +_Static_assert(sizeof(struct cell) == + ( (4 * sizeof(u32)) + (4 * MAX_L3S) + (8 * MAX_L3S)), + "struct cell size must be stable for Rust bindings"); + +// Ensure no unexpected padding was added +_Static_assert(sizeof(struct cell) == 208, + "struct cell must be exactly 208 bytes"); + /* Statistics */ enum cell_stat_idx { CSTAT_LOCAL, diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h index 7ed77d68c3..eb5e35c352 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -138,7 +138,7 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) } } // unlock RCU - + // Write to cell bpf_spin_lock(&cell->lock); for (u32 l3 = 0; l3 < nr_l3; l3++) { cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; @@ -159,7 +159,6 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) * @cell_id: The cell ID to select an L3 from * @return: L3 ID on success, L3_INVALID on error */ -// TODO: Lock static inline s32 pick_l3_for_task(u32 cell_id) { struct cell *cell; @@ -170,9 +169,15 @@ static inline s32 pick_l3_for_task(u32 cell_id) return L3_INVALID; } + // Snapshot the current state of the cell + struct cell cell_snapshot; + bpf_spin_lock(&cell->lock); + copy_cell_skip_lock(&cell_snapshot, cell); + bpf_spin_unlock(&cell->lock); + // No cpus - if (!cell->cpu_cnt) { - scx_bpf_error( "pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); + if (!cell_snapshot.cpu_cnt) { + scx_bpf_error("pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); return L3_INVALID; } @@ -180,14 +185,14 @@ static inline s32 pick_l3_for_task(u32 cell_id) * weighted selection - accumulate CPU counts until we exceed target */ /* Generate random target value in range [0, cpu_cnt) */ - u32 target = bpf_get_prandom_u32() % cell->cpu_cnt; + u32 target = bpf_get_prandom_u32() % cell_snapshot.cpu_cnt; u32 l3, cur = 0; s32 ret = L3_INVALID; // This could be a prefix sum. Find first l3 where we exceed target bpf_for(l3, 0, nr_l3) { - cur += cell->l3_cpu_cnt[l3]; + cur += cell_snapshot.l3_cpu_cnt[l3]; if (target < cur) { ret = (s32)l3; break; diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index b920ecaf25..c1e6acf17e 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -182,15 +182,15 @@ static inline int allocate_cell() if (!(c = lookup_cell(cell_idx))) return -1; - if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) { - // TODO XXX, I think we need to make this concurrent safe - // TODO, lock with recalc_cell...() - __builtin_memset(c->l3_cpu_cnt, 0, sizeof(c->l3_cpu_cnt)); - c->l3_present_cnt = 0; - // TODO zero cpu_cnt - // TODO Just zero the whole cell struct? 
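+		/* Take the cell lock so claiming and zeroing the cell cannot
+		 * race with other allocators or recalc_cell_l3_counts() */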
+ bpf_spin_lock(&c->lock); + if (c->in_use == 0) { + // Zero everything except the lock (which is first) + __builtin_memset(&c->in_use, 0, sizeof(struct cell) - sizeof(CELL_LOCK_T)); + c->in_use = 1; // Then mark as in use + bpf_spin_unlock(&c->lock); return cell_idx; } + bpf_spin_unlock(&c->lock); } scx_bpf_error("No available cells to allocate"); return -1; @@ -309,7 +309,6 @@ static inline int update_task_cpumask(struct task_struct *p, // We want to set the task vtime to that of the cell it's joining. if (tctx->all_cell_cpus_allowed) { - const struct cpumask *l3_mask = NULL; if (tctx->l3 != L3_INVALID) { l3_mask = lookup_l3_cpumask((u32)tctx->l3); @@ -346,8 +345,10 @@ static inline int update_task_cpumask(struct task_struct *p, tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); struct cell *cell = lookup_cell(tctx->cell); - if (!cell) + if (!cell) { + scx_bpf_error("Invalid cell"); return -ENOENT; + } if (!l3_is_valid(tctx->l3)){ scx_bpf_error("Invalid L3 %d", tctx->l3); @@ -1099,7 +1100,7 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) !(cell = lookup_cell(cctx->cell))) return; - /* + /* * If this task was stolen across L3s, retag to thief L3 and recompute * effective cpumask+DSQ. Preserve vtime to keep fairness. */ @@ -1385,7 +1386,7 @@ static __always_inline void dump_cell_state(u32 cell_idx) } scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", - cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); + cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); u32 l3; // TODO Print vtimes for L3s @@ -1399,6 +1400,8 @@ static __always_inline void dump_cell_state(u32 cell_idx) // TODO: FIX THIS static __always_inline void dump_l3_state(){ + + } void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index 4441a19a27..e42f7379f2 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -46,8 +46,6 @@ extern const volatile u32 nr_l3; - - extern struct cell_map cells; enum mitosis_constants { @@ -71,8 +69,16 @@ enum mitosis_constants { ANY_NUMA = -1, }; - - +static inline void copy_cell_skip_lock(struct cell *dst, const struct cell *src) +{ + /* Copy everything AFTER the lock field. + * Since lock is first and 4 bytes (verified by static assertions), + * we skip it and copy the remainder of the struct. 
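+	 * This lets callers such as pick_l3_for_task() take a consistent
+	 * snapshot of a cell under its spin lock without touching the lock itself.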
+ */ + __builtin_memcpy(&dst->in_use, + &src->in_use, + sizeof(struct cell) - sizeof(CELL_LOCK_T)); +} static inline struct cell *lookup_cell(int idx) { From 73a86232faa2edcff9026b8910e59fcc3257e1bc Mon Sep 17 00:00:00 2001 From: tommy-u Date: Thu, 9 Oct 2025 14:17:18 -0700 Subject: [PATCH 10/12] Clang format --- scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h | 82 ++++++++---- scheds/rust/scx_mitosis/src/bpf/intf.h | 22 ++-- .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 26 ++-- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 121 +++++++++++++----- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 33 +++-- 5 files changed, 195 insertions(+), 89 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h index a8a8a21c2e..fc50f17fba 100644 --- a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -70,32 +70,51 @@ #endif /* ---- Bitfield widths (bits) ---- */ -#define CPU_B 28 -#define L3_B 16 -#define CELL_B 12 -#define TYPE_B 4 -#define DATA_B 28 -#define RSVD_B 32 +#define CPU_B 28 +#define L3_B 16 +#define CELL_B 12 +#define TYPE_B 4 +#define DATA_B 28 +#define RSVD_B 32 /* Sum checks (in bits) */ -_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); -_Static_assert(L3_B + CELL_B + TYPE_B == 32, "CELL+L3 layout low half must be 32 bits"); -_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); +_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); +_Static_assert(L3_B + CELL_B + TYPE_B == 32, + "CELL+L3 layout low half must be 32 bits"); +_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); typedef union { u64 raw; /* Per-CPU user DSQ */ - struct { u64 cpu: CPU_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cpu_dsq; + struct { + u64 cpu : CPU_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } cpu_dsq; /* Cell+L3 user DSQ */ - struct { u64 l3: L3_B; u64 cell: CELL_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cell_l3_dsq; + struct { + u64 l3 : L3_B; + u64 cell : CELL_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } cell_l3_dsq; /* Generic user view */ - struct { u64 data: DATA_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } user_dsq; + struct { + u64 data : DATA_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } user_dsq; /* Built-in DSQ view */ - struct { u64 value:32; u64 rsvd:30; u64 local_on:1; u64 builtin:1; } builtin_dsq; + struct { + u64 value : 32; + u64 rsvd : 30; + u64 local_on : 1; + u64 builtin : 1; + } builtin_dsq; /* NOTE: Considered packed and aligned attributes, but that's redundant */ } dsq_id_t; @@ -105,16 +124,22 @@ typedef union { * invalid bc bit 63 clear (it's a user DSQ) && dsq_type == 0 (no type) * Good for catching uninitialized DSQ IDs. 
*/ -#define DSQ_INVALID ((u64) 0) +#define DSQ_INVALID ((u64)0) -_Static_assert(sizeof(((dsq_id_t){0}).cpu_dsq) == sizeof(u64), "cpu view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).cell_l3_dsq) == sizeof(u64), "cell+l3 view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).user_dsq) == sizeof(u64), "user common view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).builtin_dsq) == sizeof(u64), "builtin view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).cpu_dsq) == sizeof(u64), + "cpu view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).cell_l3_dsq) == sizeof(u64), + "cell+l3 view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).user_dsq) == sizeof(u64), + "user common view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).builtin_dsq) == sizeof(u64), + "builtin view must be 8 bytes"); /* Compile-time checks (in bytes) */ -_Static_assert(sizeof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8 bytes (64 bits)"); -_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8-byte aligned"); +_Static_assert(sizeof(dsq_id_t) == sizeof(u64), + "dsq_id_t must be 8 bytes (64 bits)"); +_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), + "dsq_id_t must be 8-byte aligned"); /* DSQ type enumeration */ enum dsq_type { @@ -124,17 +149,20 @@ enum dsq_type { }; /* Range guards */ -_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); -_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); +_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); +_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); _Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); -_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), "DSQ_TYPE_CELL_L3 must fit in field"); +_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), + "DSQ_TYPE_CELL_L3 must fit in field"); /* * While I considered error propagation, I decided to bail to force errors early. */ -static inline bool is_user_dsq(dsq_id_t dsq_id){ - return !dsq_id.builtin_dsq.builtin && dsq_id.user_dsq.type != DSQ_TYPE_NONE; +static inline bool is_user_dsq(dsq_id_t dsq_id) +{ + return !dsq_id.builtin_dsq.builtin && + dsq_id.user_dsq.type != DSQ_TYPE_NONE; } // Is this a per CPU DSQ? 
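 // (i.e., a user DSQ whose type field is DSQ_TYPE_CPU; see is_cpu_dsq() below)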
@@ -167,5 +195,7 @@ static inline dsq_id_t get_cell_l3_dsq_id(u32 cell, u32 l3) if (cell >= MAX_CELLS || l3 >= MAX_L3S) scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); - return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; + return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, + .cell = cell, + .type = DSQ_TYPE_CELL_L3 } }; } diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 130f4f2480..b1612430c6 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -41,10 +41,13 @@ enum consts { /* Kernel side sees the real lock; userspace sees padded bytes of same size/alignment */ #if defined(__BPF__) -# define CELL_LOCK_T struct bpf_spin_lock +#define CELL_LOCK_T struct bpf_spin_lock #else /* userspace placeholder: kernel won’t copy spin_lock */ -# define CELL_LOCK_T struct { u32 __pad; } /* 4-byte aligned as required */ +#define CELL_LOCK_T \ + struct { \ + u32 __pad; \ + } /* 4-byte aligned as required */ #endif struct cell { @@ -74,25 +77,24 @@ struct cell { // All assertions work for both BPF and userspace builds _Static_assert(offsetof(struct cell, lock) == 0, - "lock/padding must be first field"); + "lock/padding must be first field"); _Static_assert(sizeof(((struct cell *)0)->lock) == 4, - "lock/padding must be 4 bytes"); + "lock/padding must be 4 bytes"); _Static_assert(_Alignof(CELL_LOCK_T) == 4, - "lock/padding must be 4-byte aligned"); + "lock/padding must be 4-byte aligned"); _Static_assert(offsetof(struct cell, in_use) == 4, - "in_use must follow 4-byte lock/padding"); + "in_use must follow 4-byte lock/padding"); // Verify these are the same size in both BPF and Rust. _Static_assert(sizeof(struct cell) == - ( (4 * sizeof(u32)) + (4 * MAX_L3S) + (8 * MAX_L3S)), - "struct cell size must be stable for Rust bindings"); + ((4 * sizeof(u32)) + (4 * MAX_L3S) + (8 * MAX_L3S)), + "struct cell size must be stable for Rust bindings"); -// Ensure no unexpected padding was added _Static_assert(sizeof(struct cell) == 208, - "struct cell must be exactly 208 bytes"); + "struct cell must be exactly 208 bytes"); /* Statistics */ enum cell_stat_idx { diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h index eb5e35c352..2e5281984b 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -15,7 +15,7 @@ #include "intf.h" typedef u32 l3_id_t; -#define L3_INVALID ((l3_id_t) ~0u) +#define L3_INVALID ((l3_id_t)~0u) // Configure how aggressively we steal work. 
// When task is detected as a steal candidate, skip it this many times @@ -51,14 +51,16 @@ extern struct cpu_to_l3_map cpu_to_l3; extern struct l3_to_cpus_map l3_to_cpus; extern struct steal_stats_map steal_stats; -static inline const bool l3_is_valid(u32 l3_id) { +static inline const bool l3_is_valid(u32 l3_id) +{ if (l3_id == L3_INVALID) return false; return (l3_id >= 0) && (l3_id < MAX_L3S); } -static inline void init_task_l3(struct task_ctx *tctx) { +static inline void init_task_l3(struct task_ctx *tctx) +{ tctx->l3 = L3_INVALID; #if MITOSIS_ENABLE_STEALING @@ -67,7 +69,6 @@ static inline void init_task_l3(struct task_ctx *tctx) { tctx->last_stolen_at = 0; tctx->steals_prevented = 0; #endif - } static inline const struct cpumask *lookup_l3_cpumask(u32 l3) @@ -101,7 +102,7 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) u32 l3, l3s_present = 0, total_cpus = 0; // Just so we don't hold the lock longer than necessary - u32 l3_cpu_cnt_tmp[MAX_L3S] = {0}; + u32 l3_cpu_cnt_tmp[MAX_L3S] = { 0 }; { // RCU context RCU_READ_GUARD(); @@ -109,7 +110,8 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) lookup_cell_cpumask(cell_idx); // RCU ptr if (!cell_mask) { - scx_bpf_error("recalc_cell_l3_counts: invalid cell mask"); + scx_bpf_error( + "recalc_cell_l3_counts: invalid cell mask"); return; } @@ -117,13 +119,15 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) { const struct cpumask *l3_mask = lookup_l3_cpumask(l3); if (!l3_mask) { - scx_bpf_error( "recalc_cell_l3_counts: invalid l3 mask"); + scx_bpf_error( + "recalc_cell_l3_counts: invalid l3 mask"); return; } bpf_cpumask_and(tmp_guard.mask, cell_mask, l3_mask); - u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp_guard.mask); + u32 cnt = bpf_cpumask_weight( + (const struct cpumask *)tmp_guard.mask); l3_cpu_cnt_tmp[l3] = cnt; @@ -141,7 +145,7 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) // Write to cell bpf_spin_lock(&cell->lock); for (u32 l3 = 0; l3 < nr_l3; l3++) { - cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; + cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; } cell->l3_present_cnt = l3s_present; @@ -177,7 +181,9 @@ static inline s32 pick_l3_for_task(u32 cell_id) // No cpus if (!cell_snapshot.cpu_cnt) { - scx_bpf_error("pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); + scx_bpf_error( + "pick_l3_for_task: cell %d has no CPUs accounted yet", + cell_id); return L3_INVALID; } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index c1e6acf17e..44cfee2f3d 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -63,7 +63,8 @@ struct l3_to_cpus_map l3_to_cpus SEC(".maps"); struct function_counters_map function_counters SEC(".maps"); struct steal_stats_map steal_stats SEC(".maps"); -static inline void increment_counter(enum fn_counter_idx idx) { +static inline void increment_counter(enum fn_counter_idx idx) +{ u64 *counter; u32 key = idx; @@ -168,7 +169,6 @@ static inline struct cpu_ctx *lookup_cpu_ctx(int cpu) return cctx; } - /* * Cells are allocated concurrently in some cases (e.g. cgroup_init). 
* allocate_cell and free_cell enable these allocations to be done safely @@ -185,8 +185,10 @@ static inline int allocate_cell() bpf_spin_lock(&c->lock); if (c->in_use == 0) { // Zero everything except the lock (which is first) - __builtin_memset(&c->in_use, 0, sizeof(struct cell) - sizeof(CELL_LOCK_T)); - c->in_use = 1; // Then mark as in use + __builtin_memset(&c->in_use, 0, + sizeof(struct cell) - + sizeof(CELL_LOCK_T)); + c->in_use = 1; // Then mark as in use bpf_spin_unlock(&c->lock); return cell_idx; } @@ -313,7 +315,8 @@ static inline int update_task_cpumask(struct task_struct *p, if (tctx->l3 != L3_INVALID) { l3_mask = lookup_l3_cpumask((u32)tctx->l3); /* If the L3 no longer intersects the cell's cpumask, invalidate it */ - if (!l3_mask || !bpf_cpumask_intersects(cell_cpumask, l3_mask)) + if (!l3_mask || + !bpf_cpumask_intersects(cell_cpumask, l3_mask)) tctx->l3 = L3_INVALID; } @@ -333,10 +336,13 @@ static inline int update_task_cpumask(struct task_struct *p, /* --- Narrow the effective cpumask by the chosen L3 --- */ /* tctx->cpumask already contains (task_affinity ∧ cell_mask) */ if (tctx->cpumask) - bpf_cpumask_and(tctx->cpumask, (const struct cpumask *)tctx->cpumask, l3_mask); + bpf_cpumask_and(tctx->cpumask, + (const struct cpumask *)tctx->cpumask, + l3_mask); /* If empty after intersection, nothing can run here */ - if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { + if (tctx->cpumask && + bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { scx_bpf_error("Empty cpumask after intersection"); return -ENODEV; } @@ -350,7 +356,7 @@ static inline int update_task_cpumask(struct task_struct *p, return -ENOENT; } - if (!l3_is_valid(tctx->l3)){ + if (!l3_is_valid(tctx->l3)) { scx_bpf_error("Invalid L3 %d", tctx->l3); return -EINVAL; } @@ -571,7 +577,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) return; if (!l3_is_valid(tctx->l3)) { - scx_bpf_error("Invalid L3 ID for task %d in enqueue", p->pid); + scx_bpf_error("Invalid L3 ID for task %d in enqueue", + p->pid); return; } basis_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); @@ -640,7 +647,8 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) /* Check the L3 queue */ if (l3 != L3_INVALID) { dsq_id_t cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); - bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) { + bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) + { min_vtime = p->scx.dsq_vtime; min_vtime_dsq = cell_l3_dsq; found = true; @@ -649,7 +657,8 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) } /* Check the CPU DSQ for a lower vtime */ - bpf_for_each(scx_dsq, p, local_dsq.raw, 0) { + bpf_for_each(scx_dsq, p, local_dsq.raw, 0) + { if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { min_vtime = p->scx.dsq_vtime; min_vtime_dsq = local_dsq; @@ -664,7 +673,6 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) * we might never wakeup. */ - if (found) { // We found a task in the local or cell-L3 DSQ @@ -824,7 +832,8 @@ int running; /* The guard is a stack variable. When it falls out of scope, * we drop the running lock. 
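 * (the guard's cleanup attribute calls __running_unlock() automatically
 * when it goes out of scope)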
*/ -static inline void __running_unlock(int *guard) { +static inline void __running_unlock(int *guard) +{ (void)guard; /* unused */ WRITE_ONCE(running, 0); } @@ -834,7 +843,6 @@ static inline void __running_unlock(int *guard) { */ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) { - u32 local_configuration_seq = READ_ONCE(configuration_seq); if (local_configuration_seq == READ_ONCE(applied_configuration_seq)) return; @@ -908,7 +916,8 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) * Iterate over all cgroups, check if any have a cpumask and populate them * as a separate cell. */ - bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) { + bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) + { cur_cgrp = pos->cgroup; /* @@ -1033,7 +1042,8 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) int cpu_idx; bpf_for(cpu_idx, 0, nr_possible_cpus) { - if (bpf_cpumask_test_cpu( cpu_idx, (const struct cpumask *)root_bpf_cpumask)) { + if (bpf_cpumask_test_cpu(cpu_idx, (const struct cpumask *) + root_bpf_cpumask)) { struct cpu_ctx *cpu_ctx; if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) goto out_root_cgrp; @@ -1058,10 +1068,12 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) int cell_idx; /* Recalculate L3 counts for all active cells after CPU assignment changes */ - bpf_for(cell_idx, 1, MAX_CELLS) { + bpf_for(cell_idx, 1, MAX_CELLS) + { struct cell *cell; if (!(cell = lookup_cell(cell_idx))) { - scx_bpf_error("Lookup for cell %d failed in tick()", cell_idx); + scx_bpf_error("Lookup for cell %d failed in tick()", + cell_idx); goto out_root_cgrp; } @@ -1132,8 +1144,10 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) * Update per-(cell, L3) vtime for cell-schedulable tasks */ if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { - if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), p->scx.dsq_vtime)) - WRITE_ONCE(cell->l3_vtime_now[tctx->l3], p->scx.dsq_vtime); + if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), + p->scx.dsq_vtime)) + WRITE_ONCE(cell->l3_vtime_now[tctx->l3], + p->scx.dsq_vtime); } /* @@ -1386,22 +1400,67 @@ static __always_inline void dump_cell_state(u32 cell_idx) } scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", - cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); + cell_idx, cell->in_use, cell->cpu_cnt, + cell->l3_present_cnt); u32 l3; // TODO Print vtimes for L3s // TODO lock - bpf_for(l3, 0, nr_l3) { + bpf_for(l3, 0, nr_l3) + { if (cell->l3_cpu_cnt[l3] > 0) { - scx_bpf_dump(" L3[%d]: %d CPUs", l3, cell->l3_cpu_cnt[l3]); + scx_bpf_dump(" L3[%d]: %d CPUs", l3, + cell->l3_cpu_cnt[l3]); } } } -// TODO: FIX THIS -static __always_inline void dump_l3_state(){ +static __always_inline void dump_l3_state() +{ + u32 l3; + const struct cpumask *l3_mask; + dsq_id_t dsq_id; + + scx_bpf_dump("\n=== L3 Cache Topology ===\n"); + scx_bpf_dump("Total L3 domains: %d\n", nr_l3); + bpf_for(l3, 0, nr_l3) + { + l3_mask = lookup_l3_cpumask(l3); + if (!l3_mask) { + scx_bpf_dump( + "L3[%d]: ERROR - failed to lookup cpumask\n", + l3); + continue; + } + + scx_bpf_dump("L3[%d] CPUS=", l3); + dump_cpumask(l3_mask); + scx_bpf_dump("\n"); + scx_bpf_dump(" Per-cell DSQ stats:\n"); + u32 cell_idx; + bpf_for(cell_idx, 0, MAX_CELLS) + { + struct cell *cell = lookup_cell(cell_idx); + if (!cell || !cell->in_use) + continue; + + if (!l3_is_valid(l3)) + continue; + + dsq_id = get_cell_l3_dsq_id(cell_idx, l3); + u64 nr_queued = scx_bpf_dsq_nr_queued(dsq_id.raw); + + if (nr_queued > 0 || 
cell->l3_cpu_cnt[l3] > 0) { + scx_bpf_dump( + " Cell[%d]: %d CPUs, vtime=%llu, nr_queued=%llu\n", + cell_idx, cell->l3_cpu_cnt[l3], + READ_ONCE(cell->l3_vtime_now[l3]), + nr_queued); + } + } + } } void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) @@ -1439,7 +1498,6 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) } dump_l3_state(); - } void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, @@ -1485,7 +1543,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { if (*u8_ptr & (1 << (i % 8))) { bpf_cpumask_set_cpu(i, cpumask); - ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, ANY_NUMA); + ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, + ANY_NUMA); if (ret < 0) { bpf_cpumask_release(cpumask); return ret; @@ -1496,7 +1555,6 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) } } - cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); if (cpumask) bpf_cpumask_release(cpumask); @@ -1553,9 +1611,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) u32 l3; bpf_for(l3, 0, nr_l3) { - ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, ANY_NUMA); + ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, + ANY_NUMA); if (ret < 0) - scx_bpf_error( "Failed to create DSQ for cell %d, L3 %d: err %d", i, l3, ret); + scx_bpf_error( + "Failed to create DSQ for cell %d, L3 %d: err %d", + i, l3, ret); } } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index e42f7379f2..52738c6a21 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -75,9 +75,8 @@ static inline void copy_cell_skip_lock(struct cell *dst, const struct cell *src) * Since lock is first and 4 bytes (verified by static assertions), * we skip it and copy the remainder of the struct. 
*/ - __builtin_memcpy(&dst->in_use, - &src->in_use, - sizeof(struct cell) - sizeof(CELL_LOCK_T)); + __builtin_memcpy(&dst->in_use, &src->in_use, + sizeof(struct cell) - sizeof(CELL_LOCK_T)); } static inline struct cell *lookup_cell(int idx) @@ -166,35 +165,43 @@ struct rcu_read_guard { bool active; }; -static inline struct rcu_read_guard rcu_read_lock_guard(void) { +static inline struct rcu_read_guard rcu_read_lock_guard(void) +{ bpf_rcu_read_lock(); - return (struct rcu_read_guard){.active = true}; + return (struct rcu_read_guard){ .active = true }; } -static inline void rcu_read_guard_release(struct rcu_read_guard *guard) { +static inline void rcu_read_guard_release(struct rcu_read_guard *guard) +{ if (guard->active) { bpf_rcu_read_unlock(); guard->active = false; } } -#define RCU_READ_GUARD() \ - struct rcu_read_guard __rcu_guard __attribute__((__cleanup__(rcu_read_guard_release))) = rcu_read_lock_guard() +#define RCU_READ_GUARD() \ + struct rcu_read_guard __rcu_guard \ + __attribute__((__cleanup__(rcu_read_guard_release))) = \ + rcu_read_lock_guard() struct cpumask_guard { struct bpf_cpumask *mask; }; -static inline struct cpumask_guard cpumask_create_guard(void) { +static inline struct cpumask_guard cpumask_create_guard(void) +{ struct bpf_cpumask *mask = bpf_cpumask_create(); - return (struct cpumask_guard){.mask = mask}; + return (struct cpumask_guard){ .mask = mask }; } -static inline void cpumask_guard_release(struct cpumask_guard *guard) { +static inline void cpumask_guard_release(struct cpumask_guard *guard) +{ if (guard->mask) { bpf_cpumask_release(guard->mask); guard->mask = NULL; } } -#define CPUMASK_GUARD(var_name) \ - struct cpumask_guard var_name __attribute__((__cleanup__(cpumask_guard_release))) = cpumask_create_guard() +#define CPUMASK_GUARD(var_name) \ + struct cpumask_guard var_name \ + __attribute__((__cleanup__(cpumask_guard_release))) = \ + cpumask_create_guard() From d0a7eed1d999faf8d15093b2eda02eff4fc46f53 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Thu, 9 Oct 2025 14:54:41 -0700 Subject: [PATCH 11/12] remove accidental code file --- code.txt | 2382 ------------------------------------------------------ 1 file changed, 2382 deletions(-) delete mode 100644 code.txt diff --git a/code.txt b/code.txt deleted file mode 100644 index 64c3002bbe..0000000000 --- a/code.txt +++ /dev/null @@ -1,2382 +0,0 @@ -]633;E;for file in scheds/rust/scx_mitosis/src/bpf/*;7dc75c10-53e2-4af4-8cab-ea0159bd7502]633;C# File: scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h -/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -/* - * This software may be used and distributed according to the terms of the - * GNU General Public License version 2. - * - * This header defines the 64-bit dispatch queue (DSQ) ID encoding - * scheme for scx_mitosis, using type fields to distinguish between - * per-CPU and cell+L3 domain queues. It includes helper functions to - * construct, validate, and parse these DSQ IDs for queue management. - */ -#pragma once - -#include "intf.h" -#include "mitosis.bpf.h" - -/* - * ================================ - * BPF DSQ ID Layout (64 bits wide) - * ================================ - * - * Top-level format: - * [63] [62..0] - * [ B] [ ID ] - * - * If B == 1 it is a Built-in DSQ - * ------------------------- - * [63] [62] [61 .. 
32] [31..0] - * [ 1] [ L] [ R ] [ V ] - * - * - L (bit 62): LOCAL_ON flag - * If L == 1 -> V = CPU number - * - R (30 bits): reserved / unused - * - V (32 bits): value (e.g., CPU#) - * - * If B == 0 -> User-defined DSQ - * ----------------------------- - * Only the low 32 bits are used. - * - * [63 .. 32] [31..0] - * [ 0][ unused ] [ VAL ] - * - * Mitosis uses VAL as follows: - * - * [31..28] [27..0] - * [QTYPE ] [DATA ] - * - * QTYPE encodes the queue type: - * - * QTYPE = 0x1 -> Per-CPU Q - * [31..28] [27 .. .. 0] - * [ 0001 ] [ CPU# ] - * [Q-TYPE:1] - * - * QTYPE = 0x2 -> Cell+L3 Q - * [31..28] [27 .. 16] [15 .. 0] - * [ 0010 ] [ CELL# ] [ L3ID ] - * [Q-TYPE:2] - * - */ -/* - * The use of these bitfields depends on compiler defined byte AND bit ordering. - * Make sure we're only building with Clang/LLVM and that we're little-endian. - */ -#ifndef __clang__ -#error "This code must be compiled with Clang/LLVM (eBPF: clang -target bpf)." -#endif - -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ -#error "dsq64 bitfield layout assumes little-endian (bpfel)." -#endif - -/* ---- Bitfield widths (bits) ---- */ -#define CPU_B 28 -#define L3_B 16 -#define CELL_B 12 -#define TYPE_B 4 -#define DATA_B 28 -#define RSVD_B 32 - -/* Sum checks (in bits) */ -_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); -_Static_assert(L3_B + CELL_B + TYPE_B == 32, "CELL+L3 layout low half must be 32 bits"); -_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); - -typedef union { - u64 raw; - - /* Per-CPU user DSQ */ - struct { u64 cpu: CPU_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cpu_dsq; - - /* Cell+L3 user DSQ */ - struct { u64 l3: L3_B; u64 cell: CELL_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cell_l3_dsq; - - /* Generic user view */ - struct { u64 data: DATA_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } user_dsq; - - /* Built-in DSQ view */ - struct { u64 value:32; u64 rsvd:30; u64 local_on:1; u64 builtin:1; } builtin_dsq; - - /* NOTE: Considered packed and aligned attributes, but that's redundant */ -} dsq_id_t; - -/* - * Invalid DSQ ID Sentinel: - * invalid bc bit 63 clear (it's a user DSQ) && dsq_type == 0 (no type) - * Good for catching uninitialized DSQ IDs. -*/ -#define DSQ_INVALID ((u64) 0) - -_Static_assert(sizeof(((dsq_id_t){0}).cpu_dsq) == sizeof(u64), "cpu view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).cell_l3_dsq) == sizeof(u64), "cell+l3 view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).user_dsq) == sizeof(u64), "user common view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).builtin_dsq) == sizeof(u64), "builtin view must be 8 bytes"); - -/* Compile-time checks (in bytes) */ -_Static_assert(sizeof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8 bytes (64 bits)"); -_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8-byte aligned"); - -/* DSQ type enumeration */ -enum dsq_type { - DSQ_TYPE_NONE, - DSQ_TYPE_CPU, - DSQ_TYPE_CELL_L3, -}; - -/* Range guards */ -_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); -_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); -_Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); -_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), "DSQ_TYPE_CELL_L3 must fit in field"); - -/* - * While I considered error propagation, I decided to bail to force errors early. 
-*/ - -static inline bool is_user_dsq(dsq_id_t dsq_id){ - return !dsq_id.builtin_dsq.builtin && dsq_id.user_dsq.type != DSQ_TYPE_NONE; -} - -// Is this a per CPU DSQ? -static inline bool is_cpu_dsq(dsq_id_t dsq_id) -{ - return is_user_dsq(dsq_id) && dsq_id.user_dsq.type == DSQ_TYPE_CPU; -} - -// If this is a per cpu dsq, return the cpu -static inline u32 get_cpu_from_dsq(dsq_id_t dsq_id) -{ - if (!is_cpu_dsq(dsq_id)) - scx_bpf_error("trying to get cpu from non-cpu dsq\n"); - - return dsq_id.cpu_dsq.cpu; -} - -/* Helper functions to construct DSQ IDs */ -static inline dsq_id_t get_cpu_dsq_id(u32 cpu) -{ - // Check for valid CPU range, 0 indexed so >=. - if (cpu >= MAX_CPUS) - scx_bpf_error("invalid cpu %u\n", cpu); - - return (dsq_id_t){ .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; -} - -static inline dsq_id_t get_cell_l3_dsq_id(u32 cell, u32 l3) -{ - if (cell >= MAX_CELLS || l3 >= MAX_L3S) - scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); - - return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; -} -# File: scheds/rust/scx_mitosis/src/bpf/intf.h -// Copyright (c) Meta Platforms, Inc. and affiliates. - -// This software may be used and distributed according to the terms of the -// GNU General Public License version 2. -#ifndef __INTF_H -#define __INTF_H - -#ifndef __KERNEL__ -typedef unsigned long long u64; -typedef unsigned int u32; -typedef _Bool bool; -#endif - -#ifdef LSP -#define __bpf__ -#include "../../../../include/scx/ravg.bpf.h" -#else -#include -#endif - -/* ---- Work stealing config (compile-time) ------------------------------- */ -#ifndef MITOSIS_ENABLE_STEALING -#define MITOSIS_ENABLE_STEALING 1 -#endif -/* ----------------------------------------------------------------------- */ - -enum consts { - CACHELINE_SIZE = 64, - MAX_CPUS_SHIFT = 9, - MAX_CPUS = 1 << MAX_CPUS_SHIFT, - MAX_CPUS_U8 = MAX_CPUS / 8, - MAX_CELLS = 16, - USAGE_HALF_LIFE = 100000000, /* 100ms */ - - PCPU_BASE = 0x80000000, - MAX_CG_DEPTH = 256, -}; - -/* Statistics */ -enum cell_stat_idx { - CSTAT_LOCAL, - CSTAT_CPU_DSQ, - CSTAT_CELL_DSQ, - CSTAT_AFFN_VIOL, - NR_CSTATS, -}; - -/* Function invocation counters */ -enum fn_counter_idx { - COUNTER_SELECT_CPU, - COUNTER_ENQUEUE, - COUNTER_DISPATCH, - NR_COUNTERS, -}; - -struct cpu_ctx { - u64 cstats[MAX_CELLS][NR_CSTATS]; - u64 cell_cycles[MAX_CELLS]; - u32 cell; - u64 vtime_now; -}; - -struct cgrp_ctx { - u32 cell; - bool cell_owner; -}; - -#endif /* __INTF_H */ -# File: scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h -/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -/* - * This software may be used and distributed according to the terms of the - * GNU General Public License version 2. - * - * This header assists adding L3 cache awareness to scx_mitosis by defining - * maps and fns for managing CPU-to-L3 domain mappings. It provides code to - * recalculate per-L3 CPU counts within cells and implements weighted - * random L3 selection for tasks. It also tracks work-stealing - * statistics for cross-L3 task migrations. - */ -#pragma once - -#include "mitosis.bpf.h" -#include "intf.h" - -typedef u32 l3_id_t; -#define L3_INVALID ((l3_id_t)~0u) - -// Configure how aggressively we steal work. 
-// When task is detected as a steal candidate, skip it this many times -// On a web server workload, 100 reduced steal count by ~90% -#ifdef MITOSIS_ENABLE_STEALING -#define PREVENT_N_STEALS 0 -#endif - -/* Work stealing statistics map - accessible from both BPF and userspace */ -struct steal_stats_map { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, 1); -}; - -// A CPU -> L3 cache ID map -struct cpu_to_l3_map { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u32); - __uint(max_entries, MAX_CPUS); -}; - -struct l3_to_cpus_map { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, struct cpumask); - __uint(max_entries, MAX_L3S); -}; - -extern struct cpu_to_l3_map cpu_to_l3; -extern struct l3_to_cpus_map l3_to_cpus; -extern struct steal_stats_map steal_stats; - -static inline const bool l3_is_valid(u32 l3_id) -{ - if (l3_id == L3_INVALID) - return false; - - return (l3_id >= 0) && (l3_id < MAX_L3S); -} - -static inline void init_task_l3(struct task_ctx *tctx) -{ - tctx->l3 = L3_INVALID; - -#if MITOSIS_ENABLE_STEALING - tctx->pending_l3 = L3_INVALID; - tctx->steal_count = 0; - tctx->last_stolen_at = 0; - tctx->steals_prevented = 0; -#endif -} - -static inline const struct cpumask *lookup_l3_cpumask(u32 l3) -{ - struct cpumask *mask; - - if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { - scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); - return NULL; - } - - return mask; -} - -/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes */ -// TODO: use RAII and lock around updates (races with ) -static __always_inline void recalc_cell_l3_counts(u32 cell_idx) -{ - struct cell *cell = lookup_cell(cell_idx); - if (!cell) { - scx_bpf_error("recalc_cell_l3_counts: invalid cell %d", - cell_idx); - return; - } - - CPUMASK_GUARD(tmp_guard); - if (!tmp_guard.mask) { - scx_bpf_error( - "recalc_cell_l3_counts: failed to create tmp mask"); - return; - } - - u32 l3, l3s_present = 0, total_cpus = 0; - // Just so we don't hold the lock longer than necessary - u32 l3_cpu_cnt_tmp[MAX_L3S] = {0}; - - { // RCU context - RCU_READ_GUARD(); - const struct cpumask *cell_mask = - lookup_cell_cpumask(cell_idx); // RCU ptr - - if (!cell_mask) { - scx_bpf_error( - "recalc_cell_l3_counts: invalid cell mask"); - return; - } - - bpf_for(l3, 0, nr_l3) - { - const struct cpumask *l3_mask = lookup_l3_cpumask(l3); - if (!l3_mask) { - scx_bpf_error( - "recalc_cell_l3_counts: invalid l3 mask"); - return; - } - - bpf_cpumask_and(tmp_guard.mask, cell_mask, l3_mask); - - u32 cnt = bpf_cpumask_weight( (const struct cpumask *)tmp_guard.mask); - - l3_cpu_cnt_tmp[l3] = cnt; - - bpf_printk("recalc_cell_l3_counts: cnt %d", cnt); - - // These are counted across the whole cell - total_cpus += cnt; - - if (cnt) - l3s_present++; - } - } // bpf_rcu_read_unlock(); - - // WITH_CELL_LOCK(cell, cell_idx, { - for (u32 l3 = 0; l3 < nr_l3; l3++) { - cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; - } - - cell->l3_present_cnt = l3s_present; - cell->cpu_cnt = total_cpus; - // }); -} - -/** - * Weighted random selection of an L3 cache domain for a task. - * - * Uses the CPU count in each L3 domain within the cell as weights to - * probabilistically select an L3. L3 domains with more CPUs in the cell - * have higher probability of being selected. 
- * - * @cell_id: The cell ID to select an L3 from - * @return: L3 ID on success, L3_INVALID on error - */ -// TODO: Lock -static inline s32 pick_l3_for_task(u32 cell_id) -{ - struct cell *cell; - - /* Look up the cell structure */ - if (!(cell = lookup_cell(cell_id))) { - scx_bpf_error("pick_l3_for_task: invalid cell %d", cell_id); - return L3_INVALID; - } - - // No cells - if (!cell->cpu_cnt) { - scx_bpf_error( "pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); - return L3_INVALID; - } - - /* Find the L3 domain corresponding to the target value using - * weighted selection - accumulate CPU counts until we exceed target */ - - /* Generate random target value in range [0, cpu_cnt) */ - u32 target = bpf_get_prandom_u32() % cell->cpu_cnt; - u32 l3, cur = 0; - s32 ret = L3_INVALID; - - // This could be a prefix sum. Find first l3 where we exceed target - bpf_for(l3, 0, nr_l3) - { - cur += cell->l3_cpu_cnt[l3]; - if (target < cur) { - ret = (s32)l3; - break; - } - } - - if (ret == L3_INVALID) { - scx_bpf_error("pick_l3_for_task: invalid L3"); - return L3_INVALID; - } - - return ret; -} - -#ifdef MITOSIS_ENABLE_STEALING - -static inline bool try_stealing_this_task(struct task_ctx *task_ctx, - s32 local_l3, u64 candidate_dsq) -{ - // Attempt the steal, can fail beacuse it's a race. - if (!scx_bpf_dsq_move_to_local(candidate_dsq)) - return false; - - // We got the task! - task_ctx->steal_count++; - task_ctx->last_stolen_at = scx_bpf_now(); - /* Retag to thief L3 (the one for this cpu) */ - task_ctx->pending_l3 = local_l3; - task_ctx->steals_prevented = 0; - - /* Increment steal counter in map */ - u32 key = 0; - u64 *count = bpf_map_lookup_elem(&steal_stats, &key); - // NOTE: This could get expensive, but I'm not anticipating that many steals. Percpu if we care. - if (count) - __sync_fetch_and_add(count, 1); - - return true; -} - -/* Work stealing: - * Scan sibling (cell,L3) DSQs in the same cell and steal the first queued task if it can run on this cpu -*/ -static inline bool try_stealing_work(u32 cell, s32 local_l3) -{ - if (!l3_is_valid(local_l3)) - scx_bpf_error("try_stealing_work: invalid local_l3"); - - struct cell *cell_ptr = lookup_cell(cell); - if (!cell_ptr) - scx_bpf_error("try_stealing_work: invalid cell"); - - // Loop over all other L3s, looking for a queued task to steal - u32 i; - bpf_for(i, 1, nr_l3) - { - // Start with the next one to spread out the load - u32 candidate_l3 = (local_l3 + i) % nr_l3; - - // Prevents the optimizer from removing the following conditional return - // so that the verifier knows the read wil be safe - barrier_var(candidate_l3); - - if (candidate_l3 >= MAX_L3S) - continue; - - // Skip L3s that are not present in this cell - // Note: rechecking cell_ptr for verifier - // TODO: Lock? - if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) - continue; - - u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3).raw; - - struct task_struct *task = NULL; - struct task_ctx *task_ctx; - // I'm only using this for the verifier - bool found_task = false; - - // Optimization: skip if faster than constructing an iterator - // Not redundant with later checking if task found (race) - if (scx_bpf_dsq_nr_queued(candidate_dsq)) - continue; - - // Just a trick for peeking the head element - bpf_for_each(scx_dsq, task, candidate_dsq, 0) - { - task_ctx = lookup_task_ctx(task); - found_task = (task_ctx != NULL); - break; - } - - // No task? Try next L3 - if (!found_task) - continue; - - // This knob throttles stealing. 
- // TODO: make runtime configurable - if (task_ctx->steals_prevented++ < PREVENT_N_STEALS) { - continue; - } - - if (!try_stealing_this_task(task_ctx, local_l3, candidate_dsq)) - continue; - - // Success, we got a task (no guarantee it was the one we peeked though... race) - return true; - } - return false; -} -#endif -# File: scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c -/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -/* - * This software may be used and distributed according to the terms of the - * GNU General Public License version 2. - * - * scx_mitosis is a dynamic affinity scheduler. Cgroups (and their tasks) are - * assigned to Cells which are affinitized to discrete sets of CPUs. The number - * of cells is dynamic, as is cgroup to cell assignment and cell to CPU - * assignment (all are determined by userspace). - * - * Each cell has an associated DSQ which it uses for vtime scheduling of the - * cgroups belonging to the cell. - */ - -// TODO: fix debug printer. -#include "intf.h" - -#include "mitosis.bpf.h" -#include "dsq.bpf.h" -#include "l3_aware.bpf.h" - -char _license[] SEC("license") = "GPL"; - -/* - * Variables populated by userspace - */ -const volatile u32 nr_possible_cpus = 1; -const volatile bool smt_enabled = true; -const volatile unsigned char all_cpus[MAX_CPUS_U8]; - -const volatile u64 slice_ns; -const volatile u64 root_cgid = 1; - -const volatile u32 nr_l3 = 1; -/* - * CPU assignment changes aren't fully in effect until a subsequent tick() - * configuration_seq is bumped on each assignment change - * applied_configuration_seq is bumped when the effect is fully applied - */ -u32 configuration_seq; -u32 applied_configuration_seq; - -private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask; -private(root_cgrp) struct cgroup __kptr *root_cgrp; - -UEI_DEFINE(uei); - -// Cells now defined as a map so we can lock. 
-struct cell_map cells SEC(".maps"); - -/* - * Maps used for L3-aware scheduling -*/ -#if 0 -struct cell_locks_map cell_locks SEC(".maps"); -#endif -struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); -struct l3_to_cpus_map l3_to_cpus SEC(".maps"); - -/* - * Maps for statistics -*/ -struct function_counters_map function_counters SEC(".maps"); -struct steal_stats_map steal_stats SEC(".maps"); - -static inline void increment_counter(enum fn_counter_idx idx) { - u64 *counter; - u32 key = idx; - - counter = bpf_map_lookup_elem(&function_counters, &key); - if (counter) - (*counter)++; -} - -static inline struct cgroup *lookup_cgrp_ancestor(struct cgroup *cgrp, - u32 ancestor) -{ - struct cgroup *cg; - - if (!(cg = bpf_cgroup_ancestor(cgrp, ancestor))) { - scx_bpf_error("Failed to get ancestor level %d for cgid %llu", - ancestor, cgrp->kn->id); - return NULL; - } - - return cg; -} - -struct { - __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); - __uint(map_flags, BPF_F_NO_PREALLOC); - __type(key, int); - __type(value, struct cgrp_ctx); -} cgrp_ctxs SEC(".maps"); - -static inline struct cgrp_ctx *lookup_cgrp_ctx_fallible(struct cgroup *cgrp) -{ - struct cgrp_ctx *cgc; - - if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, 0))) { - return NULL; - } - - return cgc; -} - -static inline struct cgrp_ctx *lookup_cgrp_ctx(struct cgroup *cgrp) -{ - struct cgrp_ctx *cgc = lookup_cgrp_ctx_fallible(cgrp); - - if (!cgc) - scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", - cgrp->kn->id); - - return cgc; -} - -static inline struct cgroup *task_cgroup(struct task_struct *p) -{ - struct cgroup *cgrp = __COMPAT_scx_bpf_task_cgroup(p); - if (!cgrp) { - scx_bpf_error("Failed to get cgroup for task %d", p->pid); - } - return cgrp; -} - -struct { - __uint(type, BPF_MAP_TYPE_TASK_STORAGE); - __uint(map_flags, BPF_F_NO_PREALLOC); - __type(key, int); - __type(value, struct task_ctx); -} task_ctxs SEC(".maps"); - -static inline struct task_ctx *lookup_task_ctx(struct task_struct *p) -{ - struct task_ctx *tctx; - - if ((tctx = bpf_task_storage_get(&task_ctxs, p, 0, 0))) { - return tctx; - } - - scx_bpf_error("task_ctx lookup failed"); - return NULL; -} - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct cpu_ctx); - __uint(max_entries, 1); -} cpu_ctxs SEC(".maps"); - -static inline struct cpu_ctx *lookup_cpu_ctx(int cpu) -{ - struct cpu_ctx *cctx; - u32 zero = 0; - - if (cpu < 0) - cctx = bpf_map_lookup_elem(&cpu_ctxs, &zero); - else - cctx = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero, cpu); - - if (!cctx) { - scx_bpf_error("no cpu_ctx for cpu %d", cpu); - return NULL; - } - - return cctx; -} - - - -/* - * Cells are allocated concurrently in some cases (e.g. cgroup_init). - * allocate_cell and free_cell enable these allocations to be done safely - */ -static inline int allocate_cell() -{ - int cell_idx; - bpf_for(cell_idx, 0, MAX_CELLS) - { - struct cell *c; - if (!(c = lookup_cell(cell_idx))) - return -1; - - if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) { - // TODO XXX, I think we need to make this concurrent safe - // TODO, lock with recalc_cell...() - __builtin_memset(c->l3_cpu_cnt, 0, sizeof(c->l3_cpu_cnt)); - c->l3_present_cnt = 0; - // TODO zero cpu_cnt - // TODO Just zero the whole cell struct? 
- return cell_idx; - } - } - scx_bpf_error("No available cells to allocate"); - return -1; -} - -static inline int free_cell(int cell_idx) -{ - struct cell *c; - - if (cell_idx < 0 || cell_idx >= MAX_CELLS) { - scx_bpf_error("Invalid cell %d", cell_idx); - return -1; - } - - if (!(c = lookup_cell(cell_idx))) - return -1; - - WRITE_ONCE(c->in_use, 0); - return 0; -} - -/* - * Store the cpumask for each cell (owned by BPF logic). We need this in an - * explicit map to allow for these to be kptrs. - */ -struct cell_cpumask_wrapper { - struct bpf_cpumask __kptr *cpumask; - /* - * To avoid allocation on the reconfiguration path, have a second cpumask we - * can just do an xchg on. - */ - struct bpf_cpumask __kptr *tmp_cpumask; -}; - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, struct cell_cpumask_wrapper); - __uint(max_entries, MAX_CELLS); - __uint(map_flags, 0); -} cell_cpumasks SEC(".maps"); - -static inline const struct cpumask *lookup_cell_cpumask(int idx) -{ - struct cell_cpumask_wrapper *cpumaskw; - - if (!(cpumaskw = bpf_map_lookup_elem(&cell_cpumasks, &idx))) { - scx_bpf_error("no cell cpumask"); - return NULL; - } - - return (const struct cpumask *)cpumaskw->cpumask; -} - -/* - * Helper functions for bumping per-cell stats - */ -static void cstat_add(enum cell_stat_idx idx, u32 cell, struct cpu_ctx *cctx, - s64 delta) -{ - u64 *vptr; - - if ((vptr = MEMBER_VPTR(*cctx, .cstats[cell][idx]))) - (*vptr) += delta; - else - scx_bpf_error("invalid cell or stat idxs: %d, %d", idx, cell); -} - -static void cstat_inc(enum cell_stat_idx idx, u32 cell, struct cpu_ctx *cctx) -{ - cstat_add(idx, cell, cctx, 1); -} - -static inline int update_task_cpumask(struct task_struct *p, - struct task_ctx *tctx) -{ - const struct cpumask *cell_cpumask; - struct cpu_ctx *cpu_ctx; - u32 cpu; - - if (!(cell_cpumask = lookup_cell_cpumask(tctx->cell))) - return -ENOENT; - - if (!tctx->cpumask) - return -EINVAL; - - /* - * Calculate the intersection of CPUs that are both: - * 1. In this task's assigned cell (cell_cpumask) - * 2. Allowed by the task's CPU affinity (p->cpus_ptr) - * Store result in tctx->cpumask - this becomes the effective CPU set - * where this task can actually run. - */ - bpf_cpumask_and(tctx->cpumask, cell_cpumask, p->cpus_ptr); - - /* - * Check if the task can run on ALL CPUs in its assigned cell. - * If cell_cpumask is a subset of p->cpus_ptr, it means the task's - * CPU affinity doesn't restrict it within the cell - it can use - * any CPU in the cell. This affects scheduling decisions later. - * True if all the bits in cell_cpumask are set in p->cpus_ptr. - */ - tctx->all_cell_cpus_allowed = - bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); - - /* - * XXX - To be correct, we'd need to calculate the vtime - * delta in the previous dsq, scale it by the load - * fraction difference and then offset from the new - * dsq's vtime_now. For now, just do the simple thing - * and assume the offset to be zero. - * - * Revisit if high frequency dynamic cell switching - * needs to be supported. - */ - - // We want to set the task vtime to that of the cell it's joining. 
- if (tctx->all_cell_cpus_allowed) { - - const struct cpumask *l3_mask = NULL; - if (tctx->l3 != L3_INVALID) { - l3_mask = lookup_l3_cpumask((u32)tctx->l3); - /* If the L3 no longer intersects the cell's cpumask, invalidate it */ - if (!l3_mask || !bpf_cpumask_intersects(cell_cpumask, l3_mask)) - tctx->l3 = L3_INVALID; - } - - /* --- Pick a new L3 if needed --- */ - if (tctx->l3 == L3_INVALID) { - s32 new_l3 = pick_l3_for_task(tctx->cell); - if (new_l3 < 0) { - scx_bpf_error("bad L3: %d", new_l3); - return -ENODEV; - } - tctx->l3 = new_l3; - l3_mask = lookup_l3_cpumask((u32)tctx->l3); - if (!l3_mask) - return -ENOENT; - } - - /* --- Narrow the effective cpumask by the chosen L3 --- */ - /* tctx->cpumask already contains (task_affinity ∧ cell_mask) */ - if (tctx->cpumask) - bpf_cpumask_and(tctx->cpumask, (const struct cpumask *)tctx->cpumask, l3_mask); - - /* If empty after intersection, nothing can run here */ - if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { - scx_bpf_error("Empty cpumask after intersection"); - return -ENODEV; - } - - /* --- Point to the correct (cell,L3) DSQ and set vtime baseline --- */ - tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); - - struct cell *cell = lookup_cell(tctx->cell); - if (!cell) - return -ENOENT; - - if (!l3_is_valid(tctx->l3)){ - scx_bpf_error("Invalid L3 %d", tctx->l3); - return -EINVAL; - } - - p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); - } else { - /* Task is CPU-restricted, use task mask */ - cpu = bpf_cpumask_any_distribute(p->cpus_ptr); - if (!(cpu_ctx = lookup_cpu_ctx(cpu))) - return -ENOENT; - tctx->dsq = get_cpu_dsq_id(cpu); - p->scx.dsq_vtime = READ_ONCE(cpu_ctx->vtime_now); - } - - return 0; -} - -/* - * Figure out the task's cell, dsq and store the corresponding cpumask in the - * task_ctx. - */ -static inline int update_task_cell(struct task_struct *p, struct task_ctx *tctx, - struct cgroup *cg) -{ - struct cgrp_ctx *cgc; - - if (!(cgc = lookup_cgrp_ctx(cg))) - return -ENOENT; - - /* - * This ordering is pretty important, we read applied_configuration_seq - * before reading everything else expecting that the updater will update - * everything and then bump applied_configuration_seq last. This ensures - * that we cannot miss an update. - */ - tctx->configuration_seq = READ_ONCE(applied_configuration_seq); - barrier(); - tctx->cell = cgc->cell; - - return update_task_cpumask(p, tctx); -} - -/* Helper function for picking an idle cpu out of a candidate set */ -static s32 pick_idle_cpu_from(struct task_struct *p, - const struct cpumask *cand_cpumask, s32 prev_cpu, - const struct cpumask *idle_smtmask) -{ - bool prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask); - s32 cpu; - - /* - * If CPU has SMT, any wholly idle CPU is likely a better pick than - * partially idle @prev_cpu. 
- */ - if (smt_enabled) { - if (prev_in_cand && - bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && - scx_bpf_test_and_clear_cpu_idle(prev_cpu)) - return prev_cpu; - - cpu = scx_bpf_pick_idle_cpu(cand_cpumask, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - return cpu; - } - - if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) - return prev_cpu; - - return scx_bpf_pick_idle_cpu(cand_cpumask, 0); -} - -/* Check if we need to update the cell/cpumask mapping */ -static __always_inline int maybe_refresh_cell(struct task_struct *p, - struct task_ctx *tctx) -{ - struct cgroup *cgrp; - int ret = 0; - if (tctx->configuration_seq != READ_ONCE(applied_configuration_seq)) { - if (!(cgrp = task_cgroup(p))) - return -1; - if (update_task_cell(p, tctx, cgrp)) - ret = -1; - bpf_cgroup_release(cgrp); - } - return ret; -} - -static __always_inline s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, - struct cpu_ctx *cctx, - struct task_ctx *tctx) -{ - struct cpumask *task_cpumask; - const struct cpumask *idle_smtmask; - s32 cpu; - - if (!(task_cpumask = (struct cpumask *)tctx->cpumask) || - !(idle_smtmask = scx_bpf_get_idle_smtmask())) { - scx_bpf_error("Failed to get task cpumask or idle smtmask"); - return -1; - } - - /* No overlap between cell cpus and task cpus, just find some idle cpu */ - if (bpf_cpumask_empty(task_cpumask)) { - cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); - cpu = pick_idle_cpu_from(p, p->cpus_ptr, prev_cpu, - idle_smtmask); - goto out; - } - - cpu = pick_idle_cpu_from(p, task_cpumask, prev_cpu, idle_smtmask); -out: - scx_bpf_put_idle_cpumask(idle_smtmask); - return cpu; -} - -/* - * select_cpu is where we update each task's cell assignment and then try to - * dispatch to an idle core in the cell if possible - */ -s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, - u64 wake_flags) -{ - s32 cpu; - struct cpu_ctx *cctx; - struct task_ctx *tctx; - - increment_counter(COUNTER_SELECT_CPU); - - if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) - return prev_cpu; - - if (maybe_refresh_cell(p, tctx) < 0) - return prev_cpu; - - /* Pinned path: only if our task really requires a per-CPU queue. */ - if (!tctx->all_cell_cpus_allowed) { - cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); - cpu = get_cpu_from_dsq(tctx->dsq); - if (scx_bpf_test_and_clear_cpu_idle(cpu)) - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); - return cpu; - } - - // Grab an idle core - if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) { - cstat_inc(CSTAT_LOCAL, tctx->cell, cctx); - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); - return cpu; - } - - if (!tctx->cpumask) { - scx_bpf_error("tctx->cpumask should never be NULL"); - return prev_cpu; - } - /* - * All else failed, send it to the prev cpu (if that's valid), otherwise any - * valid cpu. 
- */ - if (!bpf_cpumask_test_cpu(prev_cpu, cast_mask(tctx->cpumask)) && - tctx->cpumask) - cpu = bpf_cpumask_any_distribute(cast_mask(tctx->cpumask)); - else - cpu = prev_cpu; - - return cpu; -} - -void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) -{ - struct cpu_ctx *cctx; - struct task_ctx *tctx; - struct cell *cell; - s32 task_cpu = scx_bpf_task_cpu(p); - u64 vtime = p->scx.dsq_vtime; - s32 cpu = -1; - u64 basis_vtime; - - increment_counter(COUNTER_ENQUEUE); - - if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1))) - return; - - if (maybe_refresh_cell(p, tctx) < 0) - return; - - // Cpu pinned work - if (!tctx->all_cell_cpus_allowed) { - cpu = get_cpu_from_dsq(tctx->dsq); - } else if (!__COMPAT_is_enq_cpu_selected(enq_flags)) { - /* - * If we haven't selected a cpu, then we haven't looked for and kicked an - * idle CPU. Let's do the lookup now and kick at the end. - */ - if (!(cctx = lookup_cpu_ctx(-1))) - return; - cpu = pick_idle_cpu(p, task_cpu, cctx, tctx); - if (cpu == -1) - return; - if (cpu == -EBUSY) { - /* - * Verifier gets unhappy claiming two different pointer types for - * the same instruction here. This fixes it - */ - barrier_var(tctx); - if (tctx->cpumask) - cpu = bpf_cpumask_any_distribute( - (const struct cpumask *)tctx->cpumask); - } - } - - if (tctx->all_cell_cpus_allowed) { - // This is a task that can run on any cpu in the cell - - cstat_inc(CSTAT_CELL_DSQ, tctx->cell, cctx); - - /* Task can use any CPU in its cell, set basis_vtime from per-(cell, L3) vtime */ - if (!(cell = lookup_cell(tctx->cell))) - return; - - if (!l3_is_valid(tctx->l3)) { - scx_bpf_error("Invalid L3 ID for task %d in enqueue", p->pid); - return; - } - basis_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); - - } else { - // This is a task that can only run on a specific cpu - cstat_inc(CSTAT_CPU_DSQ, tctx->cell, cctx); - - /* - * cctx is the local core cpu (where enqueue is running), not the core - * the task belongs to. Fetch the right cctx - */ - if (!(cctx = lookup_cpu_ctx(cpu))) - return; - /* Task is pinned to specific CPUs, use per-CPU DSQ */ - basis_vtime = READ_ONCE(cctx->vtime_now); - } - - tctx->basis_vtime = basis_vtime; - - if (time_after(vtime, - basis_vtime + VTIME_MAX_FUTURE_MULTIPLIER * slice_ns)) { - scx_bpf_error("vtime is too far in the future for %d", p->pid); - return; - } - /* - * Limit the amount of budget that an idling task can accumulate - * to one slice. - */ - // TODO: Should this be time_before64? - if (time_before(vtime, basis_vtime - slice_ns)) - vtime = basis_vtime - slice_ns; - - scx_bpf_dsq_insert_vtime(p, tctx->dsq.raw, slice_ns, vtime, enq_flags); - - /* Kick the CPU if needed */ - if (!__COMPAT_is_enq_cpu_selected(enq_flags) && cpu >= 0) - scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); -} - -void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) -{ - struct cpu_ctx *cctx; - u32 cell; - - increment_counter(COUNTER_DISPATCH); - - if (!(cctx = lookup_cpu_ctx(-1))) - return; - - cell = READ_ONCE(cctx->cell); - - /* Start from a valid DSQ */ - dsq_id_t local_dsq = get_cpu_dsq_id(cpu); - - bool found = false; - dsq_id_t min_vtime_dsq = local_dsq; - u64 min_vtime = ~0ULL; /* U64_MAX */ - struct task_struct *p; - - // Get L3 - u32 cpu_key = (u32)cpu; - u32 *l3_ptr = bpf_map_lookup_elem(&cpu_to_l3, &cpu_key); - s32 l3 = l3_ptr ? 
(s32)*l3_ptr : L3_INVALID; - - /* Check the L3 queue */ - if (l3 != L3_INVALID) { - dsq_id_t cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); - bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) { - min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = cell_l3_dsq; - found = true; - break; - } - } - - /* Check the CPU DSQ for a lower vtime */ - bpf_for_each(scx_dsq, p, local_dsq.raw, 0) { - if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { - min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = local_dsq; - found = true; - } - break; - } - - /* - * The move_to_local can fail if we raced with some other cpu in the cell - * and now the cell is empty. We have to ensure to try the cpu_dsq or else - * we might never wakeup. - */ - - - if (found) { - // We found a task in the local or cell-L3 DSQ - - // If it was in the per cpu DSQ, there is no competation, grab it and return - if (min_vtime_dsq.raw == local_dsq.raw) { - scx_bpf_dsq_move_to_local(min_vtime_dsq.raw); - return; - } - - // If it was in the cell L3 DSQ, we are competing with other cpus in the cell-l3 - // try to move it to the local DSQ - if (scx_bpf_dsq_move_to_local(min_vtime_dsq.raw)) { - // We won the race and got the task, return - return; - } - } - -#if MITOSIS_ENABLE_STEALING - // We didn't find a task in either DSQ, or lost the race. - // Instead of going straight to idle, attempt to steal a task from another - // L3 in the cell. - - // Try stealing. If successful, this moves the task to the local runqueue - try_stealing_work(cell, l3); -#endif -} - -struct cpumask_entry { - unsigned long cpumask[CPUMASK_LONG_ENTRIES]; - u64 used; -}; - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct cpumask_entry); - __uint(max_entries, MAX_CPUMASK_ENTRIES); -} cgrp_init_percpu_cpumask SEC(".maps"); - -static inline struct cpumask_entry *allocate_cpumask_entry() -{ - int cpumask_idx; - bpf_for(cpumask_idx, 0, MAX_CPUMASK_ENTRIES) - { - struct cpumask_entry *ent = bpf_map_lookup_elem( - &cgrp_init_percpu_cpumask, &cpumask_idx); - if (!ent) { - scx_bpf_error("Failed to fetch cpumask_entry"); - return NULL; - } - if (__sync_bool_compare_and_swap(&ent->used, 0, 1)) - return ent; - } - scx_bpf_error("All cpumask entries are in use"); - return NULL; -} - -static inline void free_cpumask_entry(struct cpumask_entry *entry) -{ - WRITE_ONCE(entry->used, 0); -} - -/* For use by cleanup attribute */ -static inline void __free_cpumask_entry(struct cpumask_entry **entry) -{ - if (entry) - if (*entry) - free_cpumask_entry(*entry); -} - -#define DECLARE_CPUMASK_ENTRY(var) \ - struct cpumask_entry *var __attribute__((cleanup(__free_cpumask_entry))) - -/* Define types for cpumasks in-situ vs as a ptr in struct cpuset */ -struct cpumask___local {}; - -typedef struct cpumask___local *cpumask_var_t___ptr; - -struct cpuset___cpumask_ptr { - cpumask_var_t___ptr cpus_allowed; -}; - -typedef struct cpumask___local cpumask_var_t___arr[1]; - -struct cpuset___cpumask_arr { - cpumask_var_t___arr cpus_allowed; -}; - -/* - * Given a cgroup, get its cpumask (populated in entry), returns 0 if no - * cpumask, < 0 on error and > 0 on a populated cpumask. 
- */ -static inline int get_cgroup_cpumask(struct cgroup *cgrp, - struct cpumask_entry *entry) -{ - if (!cgrp->subsys[cpuset_cgrp_id]) - return 0; - - struct cpuset *cpuset = - container_of(cgrp->subsys[cpuset_cgrp_id], struct cpuset, css); - - if (!cpuset) - return 0; - - unsigned long runtime_cpumask_size = bpf_core_type_size(struct cpumask); - if (runtime_cpumask_size > CPUMASK_SIZE) { - scx_bpf_error( - "Definition of struct cpumask is too large. Please increase CPUMASK_LONG_ENTRIES"); - return -EINVAL; - } - - int err; - if (bpf_core_type_matches(struct cpuset___cpumask_arr)) { - struct cpuset___cpumask_arr *cpuset_typed = - (void *)bpf_core_cast(cpuset, struct cpuset); - err = bpf_core_read(&entry->cpumask, runtime_cpumask_size, - &cpuset_typed->cpus_allowed); - } else if (bpf_core_type_matches(struct cpuset___cpumask_ptr)) { - struct cpuset___cpumask_ptr *cpuset_typed = - (void *)bpf_core_cast(cpuset, struct cpuset); - err = bpf_core_read(&entry->cpumask, runtime_cpumask_size, - cpuset_typed->cpus_allowed); - } else { - scx_bpf_error( - "Definition of struct cpuset did not match any expected struct"); - return -EINVAL; - } - - if (err < 0) { - scx_bpf_error( - "bpf_core_read of cpuset->cpus_allowed failed for cgid %llu", - cgrp->kn->id); - return err; - } - - if (bpf_cpumask_empty((const struct cpumask *)&entry->cpumask)) - return 0; - - if (!all_cpumask) { - scx_bpf_error("all_cpumask should not be NULL"); - return -EINVAL; - } - - if (bpf_cpumask_subset((const struct cpumask *)all_cpumask, - (const struct cpumask *)&entry->cpumask)) - return 0; - - return 1; -} - -/* - * This array keeps track of the cgroup ancestor's cell as we iterate over the - * cgroup hierarchy. - */ -u32 level_cells[MAX_CG_DEPTH]; -int running; - -/* The guard is a stack variable. When it falls out of scope, - * we drop the running lock. 
*/ -static inline void __running_unlock(int *guard) { - (void)guard; /* unused */ - WRITE_ONCE(running, 0); -} - -/* - * On tick, we identify new cells and apply CPU assignment - */ -void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) -{ - - u32 local_configuration_seq = READ_ONCE(configuration_seq); - if (local_configuration_seq == READ_ONCE(applied_configuration_seq)) - return; - - int zero = 0; - if (!__atomic_compare_exchange_n(&running, &zero, 1, false, - __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) - return; - - int __attribute__((cleanup(__running_unlock), unused)) __running_guard; - - DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); - if (!entry) - return; - - /* Get the root cell (cell 0) and its cpumask */ - struct cell_cpumask_wrapper *root_cell_cpumaskw; - if (!(root_cell_cpumaskw = - bpf_map_lookup_elem(&cell_cpumasks, &zero))) { - scx_bpf_error("Failed to find root cell cpumask"); - return; - } - - struct bpf_cpumask *root_bpf_cpumask; - root_bpf_cpumask = - bpf_kptr_xchg(&root_cell_cpumaskw->tmp_cpumask, NULL); - if (!root_bpf_cpumask) { - scx_bpf_error("tmp_cpumask should never be null"); - return; - } - if (!root_cell_cpumaskw->cpumask) { - scx_bpf_error("root cpumasks should never be null"); - goto out; - } - - if (!all_cpumask) { - scx_bpf_error("NULL all_cpumask"); - goto out; - } - - /* - * Initialize root cell cpumask to all cpus, and then remove from it as we go - */ - bpf_cpumask_copy(root_bpf_cpumask, (const struct cpumask *)all_cpumask); - - struct cgroup_subsys_state *root_css, *pos; - struct cgroup *cur_cgrp, *root_cgrp_ref; - - if (!root_cgrp) { - scx_bpf_error("root_cgrp should not be null"); - goto out; - } - - struct cgrp_ctx *root_cgrp_ctx; - if (!(root_cgrp_ctx = lookup_cgrp_ctx(root_cgrp))) - goto out; - - if (!root_cgrp) { - scx_bpf_error("root_cgrp should not be null"); - goto out; - } - - if (!(root_cgrp_ref = bpf_cgroup_acquire(root_cgrp))) { - scx_bpf_error("Failed to acquire reference to root_cgrp"); - goto out; - } - root_css = &root_cgrp_ref->self; - - bpf_rcu_read_lock(); - /* - * Iterate over all cgroups, check if any have a cpumask and populate them - * as a separate cell. - */ - bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) { - cur_cgrp = pos->cgroup; - - /* - * We can iterate over dying cgroups, in which case this lookup will - * fail. These cgroups can't have tasks in them so just continue. - */ - struct cgrp_ctx *cgrp_ctx; - if (!(cgrp_ctx = lookup_cgrp_ctx_fallible(cur_cgrp))) - continue; - - int rc = get_cgroup_cpumask(cur_cgrp, entry); - if (!rc) { - /* - * TODO: If this was a cell owner that just had its cpuset removed, - * it should free the cell. Doing so would require draining - * in-flight tasks scheduled to the dsq. - */ - /* No cpuset, assign to parent cell and continue */ - if (cur_cgrp->kn->id != root_cgid) { - u32 level = cur_cgrp->level; - if (level <= 0 || level >= MAX_CG_DEPTH) { - scx_bpf_error( - "Cgroup hierarchy is too deep: %d", - level); - goto out_rcu_unlock; - } - /* - * This is a janky way of getting the parent cell, ideally we'd - * lookup the parent cgrp_ctx and get it that way, but some - * cgroup lookups don't work here because they are (erroneously) - * only operating on the cgroup namespace of current. Given this - * is a tick() it could be anything. See - * https://lore.kernel.org/bpf/20250811175045.1055202-1-memxor@gmail.com/ - * for details. - * - * Instead, we just track the parent cells as we walk the cgroup - * hierarchy in a separate array. 
Because the iteration is - * pre-order traversal, we're guaranteed to have the current - * cgroup's ancestor's cells in level_cells. - */ - u32 parent_cell = level_cells[level - 1]; - WRITE_ONCE(cgrp_ctx->cell, parent_cell); - level_cells[level] = parent_cell; - } - continue; - } else if (rc < 0) - goto out_rcu_unlock; - - /* - * cgroup has a cpumask, allocate a new cell if needed, and assign cpus - */ - int cell_idx = READ_ONCE(cgrp_ctx->cell); - if (!cgrp_ctx->cell_owner) { - cell_idx = allocate_cell(); - if (cell_idx < 0) - goto out_rcu_unlock; - cgrp_ctx->cell_owner = true; - } - - struct cell_cpumask_wrapper *cell_cpumaskw; - if (!(cell_cpumaskw = - bpf_map_lookup_elem(&cell_cpumasks, &cell_idx))) { - scx_bpf_error("Failed to find cell cpumask: %d", - cell_idx); - goto out_rcu_unlock; - } - - struct bpf_cpumask *bpf_cpumask; - bpf_cpumask = bpf_kptr_xchg(&cell_cpumaskw->tmp_cpumask, NULL); - if (!bpf_cpumask) { - scx_bpf_error("tmp_cpumask should never be null"); - goto out_rcu_unlock; - } - bpf_cpumask_copy(bpf_cpumask, - (const struct cpumask *)&entry->cpumask); - int cpu_idx; - bpf_for(cpu_idx, 0, nr_possible_cpus) - { - if (bpf_cpumask_test_cpu( - cpu_idx, - (const struct cpumask *)&entry->cpumask)) { - struct cpu_ctx *cpu_ctx; - if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) { - bpf_cpumask_release(bpf_cpumask); - goto out_rcu_unlock; - } - cpu_ctx->cell = cell_idx; - bpf_cpumask_clear_cpu(cpu_idx, - root_bpf_cpumask); - } - } - bpf_cpumask = - bpf_kptr_xchg(&cell_cpumaskw->cpumask, bpf_cpumask); - if (!bpf_cpumask) { - scx_bpf_error("cpumask should never be null"); - goto out_rcu_unlock; - } - - bpf_cpumask = - bpf_kptr_xchg(&cell_cpumaskw->tmp_cpumask, bpf_cpumask); - if (bpf_cpumask) { - scx_bpf_error("tmp_cpumask should be null"); - bpf_cpumask_release(bpf_cpumask); - goto out_rcu_unlock; - } - - barrier(); - WRITE_ONCE(cgrp_ctx->cell, cell_idx); - u32 level = cur_cgrp->level; - if (level <= 0 || level >= MAX_CG_DEPTH) { - scx_bpf_error("Cgroup hierarchy is too deep: %d", - level); - goto out_rcu_unlock; - } - level_cells[level] = cell_idx; - } - bpf_rcu_read_unlock(); - - /* - * assign root cell cpus that are left over - */ - int cpu_idx; - bpf_for(cpu_idx, 0, nr_possible_cpus) - { - if (bpf_cpumask_test_cpu( cpu_idx, (const struct cpumask *)root_bpf_cpumask)) { - struct cpu_ctx *cpu_ctx; - if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) - goto out_root_cgrp; - cpu_ctx->cell = 0; - } - } - - root_bpf_cpumask = - bpf_kptr_xchg(&root_cell_cpumaskw->cpumask, root_bpf_cpumask); - if (!root_bpf_cpumask) { - scx_bpf_error("root cpumask should never be null"); - bpf_cgroup_release(root_cgrp_ref); - return; - } - - root_bpf_cpumask = bpf_kptr_xchg(&root_cell_cpumaskw->tmp_cpumask, - root_bpf_cpumask); - if (root_bpf_cpumask) { - scx_bpf_error("root tmp_cpumask should be null"); - goto out_root_cgrp; - } - - int cell_idx; - /* Recalculate L3 counts for all active cells after CPU assignment changes */ - bpf_for(cell_idx, 1, MAX_CELLS) { - struct cell *cell; - if (!(cell = lookup_cell(cell_idx))) { - scx_bpf_error("Lookup for cell %d failed in tick()", cell_idx); - goto out_root_cgrp; - } - - if (!cell->in_use) - continue; - - /* Recalculate L3 counts for each active cell */ - recalc_cell_l3_counts(cell_idx); - } - - /* Recalculate root cell's L3 counts after cpumask update */ - recalc_cell_l3_counts(ROOT_CELL_ID); - - barrier(); - WRITE_ONCE(applied_configuration_seq, local_configuration_seq); - - bpf_cgroup_release(root_cgrp_ref); - return; - -out_rcu_unlock: - bpf_rcu_read_unlock(); 
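-	/*
-	 * Error unwind order: the RCU read lock is dropped above, the root
-	 * cgroup reference is released at out_root_cgrp, and the root
-	 * tmp_cpumask is released at out if we still hold it.
-	 */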
-out_root_cgrp: - bpf_cgroup_release(root_cgrp_ref); -out: - if (root_bpf_cpumask) - bpf_cpumask_release(root_bpf_cpumask); -} - -void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) -{ - struct cpu_ctx *cctx; - struct task_ctx *tctx; - struct cell *cell; - - if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1)) || - !(cell = lookup_cell(cctx->cell))) - return; - - /* - * If this task was stolen across L3s, retag to thief L3 and recompute - * effective cpumask+DSQ. Preserve vtime to keep fairness. - */ -#if MITOSIS_ENABLE_STEALING - if (l3_is_valid(tctx->pending_l3)) { - u64 save_v = p->scx.dsq_vtime; - tctx->l3 = tctx->pending_l3; - tctx->pending_l3 = L3_INVALID; - update_task_cpumask(p, tctx); - p->scx.dsq_vtime = save_v; - } -#endif - - /* Validate task's DSQ before it starts running */ - if (tctx->dsq.raw == DSQ_INVALID) { - if (tctx->all_cell_cpus_allowed) { - scx_bpf_error( - "Task %d has invalid DSQ 0 in running callback (CELL-SCHEDULABLE task, can run on any CPU in cell %d)", - p->pid, tctx->cell); - } else { - scx_bpf_error( - "Task %d has invalid DSQ 0 in running callback (CORE-PINNED task, restricted to specific CPUs)", - p->pid); - } - return; - } - - /* - * Update per-(cell, L3) vtime for cell-schedulable tasks - */ - if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { - if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), p->scx.dsq_vtime)) - WRITE_ONCE(cell->l3_vtime_now[tctx->l3], p->scx.dsq_vtime); - } - - /* - * Update CPU vtime for CPU-pinned tasks - */ - if (time_before(READ_ONCE(cctx->vtime_now), p->scx.dsq_vtime)) - WRITE_ONCE(cctx->vtime_now, p->scx.dsq_vtime); - - tctx->started_running_at = scx_bpf_now(); -} - -void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) -{ - struct cpu_ctx *cctx; - struct task_ctx *tctx; - struct cell *cell; - u64 now, used; - u32 cidx; - - if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) - return; - - cidx = tctx->cell; - if (!(cell = lookup_cell(cidx))) - return; - - now = scx_bpf_now(); - used = now - tctx->started_running_at; - tctx->started_running_at = now; - /* scale the execution time by the inverse of the weight and charge */ - p->scx.dsq_vtime += used * DEFAULT_WEIGHT_MULTIPLIER / p->scx.weight; - - if (cidx != 0 || tctx->all_cell_cpus_allowed) { - u64 *cell_cycles = MEMBER_VPTR(cctx->cell_cycles, [cidx]); - if (!cell_cycles) { - scx_bpf_error("Cell index is too large: %d", cidx); - return; - } - *cell_cycles += used; - - /* - * For cell-schedulable tasks, also accumulate vtime into - * per-cell per-L3 queues - */ - if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { - /* Accumulate weighted execution time into per-(cell, L3) vtime */ - cell->l3_vtime_now[tctx->l3] += - used * DEFAULT_WEIGHT_MULTIPLIER / - p->scx.weight; - } - } -} - -SEC("fentry/cpuset_write_resmask") -int BPF_PROG(fentry_cpuset_write_resmask, struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, ssize_t retval) -{ - /* - * On a write to cpuset.cpus, we'll need to configure new cells, bump - * configuration_seq so tick() does that. 
- */ - __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); - return 0; -} - -s32 BPF_STRUCT_OPS(mitosis_cgroup_init, struct cgroup *cgrp, - struct scx_cgroup_init_args *args) -{ - struct cgrp_ctx *cgc; - if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE))) { - scx_bpf_error("cgrp_ctx creation failed for cgid %llu", - cgrp->kn->id); - return -ENOENT; - } - - // Special case for root cell - if (cgrp->kn->id == root_cgid) { - WRITE_ONCE(cgc->cell, ROOT_CELL_ID); - return 0; - } - - DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); - if (!entry) - return -EINVAL; - int rc = get_cgroup_cpumask(cgrp, entry); - if (rc < 0) - return rc; - else if (rc > 0) { - /* - * This cgroup has a cpuset, bump configuration_seq so tick() - * configures it. - */ - __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); - } - - /* Initialize to parent's cell */ - struct cgroup *parent_cg; - if (!(parent_cg = lookup_cgrp_ancestor(cgrp, cgrp->level - 1))) - return -ENOENT; - - struct cgrp_ctx *parent_cgc; - if (!(parent_cgc = lookup_cgrp_ctx(parent_cg))) { - bpf_cgroup_release(parent_cg); - return -ENOENT; - } - - bpf_cgroup_release(parent_cg); - cgc->cell = parent_cgc->cell; - return 0; -} - -s32 BPF_STRUCT_OPS(mitosis_cgroup_exit, struct cgroup *cgrp) -{ - struct cgrp_ctx *cgc; - if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE))) { - scx_bpf_error("cgrp_ctx creation failed for cgid %llu", - cgrp->kn->id); - return -ENOENT; - } - - if (cgc->cell_owner) { - int ret; - if ((ret = free_cell(cgc->cell))) - return ret; - /* - * Need to make sure the cpus of this cell are freed back to the root - * cell and the root cell cpumask can be expanded. Bump - * configuration_seq so tick() does that. - */ - __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); - } - - return 0; -} - -void BPF_STRUCT_OPS(mitosis_cgroup_move, struct task_struct *p, - struct cgroup *from, struct cgroup *to) -{ - struct task_ctx *tctx; - - if (!(tctx = lookup_task_ctx(p))) - return; - - update_task_cell(p, tctx, to); -} - -void BPF_STRUCT_OPS(mitosis_set_cpumask, struct task_struct *p, - const struct cpumask *cpumask) -{ - struct task_ctx *tctx; - - if (!(tctx = lookup_task_ctx(p))) - return; - - if (!all_cpumask) { - scx_bpf_error("NULL all_cpumask"); - return; - } - - update_task_cpumask(p, tctx); -} - -s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, - struct scx_init_task_args *args) -{ - struct task_ctx *tctx; - struct bpf_cpumask *cpumask; - int ret; - - tctx = bpf_task_storage_get(&task_ctxs, p, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); - if (!tctx) { - scx_bpf_error("task_ctx allocation failure"); - return -ENOMEM; - } - - cpumask = bpf_cpumask_create(); - if (!cpumask) - return -ENOMEM; - - cpumask = bpf_kptr_xchg(&tctx->cpumask, cpumask); - if (cpumask) { - /* Should never happen as we just inserted it above. 
*/ - bpf_cpumask_release(cpumask); - scx_bpf_error("tctx cpumask is unexpectedly populated on init"); - return -EINVAL; - } - - if (!all_cpumask) { - scx_bpf_error("missing all_cpumask"); - return -EINVAL; - } - - /* Initialize L3 to invalid before cell assignment */ - init_task_l3(tctx); - - // TODO clean this up - if ((ret = update_task_cell(p, tctx, args->cgroup))) { - return ret; - } - - return 0; -} - -__hidden void dump_cpumask_word(s32 word, const struct cpumask *cpumask) -{ - u32 u, v = 0; - - bpf_for(u, 0, BITS_PER_U32) - { - s32 cpu = BITS_PER_U32 * word + u; - if (cpu < nr_possible_cpus && - bpf_cpumask_test_cpu(cpu, cpumask)) - v |= 1 << u; - } - scx_bpf_dump("%08x", v); -} - -static void dump_cpumask(const struct cpumask *cpumask) -{ - u32 word, nr_words = (nr_possible_cpus + 31) / 32; - - bpf_for(word, 0, nr_words) - { - if (word) - scx_bpf_dump(","); - dump_cpumask_word(nr_words - word - 1, cpumask); - } -} - -static void dump_cell_cpumask(int id) -{ - const struct cpumask *cell_cpumask; - - if (!(cell_cpumask = lookup_cell_cpumask(id))) - return; - - dump_cpumask(cell_cpumask); -} - -/* Print cell state for debugging */ -static __always_inline void dump_cell_state(u32 cell_idx) -{ - struct cell *cell = lookup_cell(cell_idx); - if (!cell) { - scx_bpf_dump("Cell %d: NOT FOUND", cell_idx); - return; - } - - scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", - cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); - - u32 l3; - // TODO Print vtimes for L3s - // TODO lock - bpf_for(l3, 0, nr_l3) { - if (cell->l3_cpu_cnt[l3] > 0) { - scx_bpf_dump(" L3[%d]: %d CPUs", l3, cell->l3_cpu_cnt[l3]); - } - } -} - -// TODO: FIX THIS -static __always_inline void dump_l3_state(){ -} - -void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) -{ - dsq_id_t dsq_id; - int i; - struct cell *cell; - struct cpu_ctx *cpu_ctx; - - scx_bpf_dump_header(); - - bpf_for(i, 0, MAX_CELLS) - { - if (!(cell = lookup_cell(i))) - return; - - if (!cell->in_use) - continue; - - scx_bpf_dump("CELL[%d] CPUS=", i); - dump_cell_cpumask(i); - scx_bpf_dump("\n"); - dump_cell_state(i); - } - - bpf_for(i, 0, nr_possible_cpus) - { - if (!(cpu_ctx = lookup_cpu_ctx(i))) - return; - - dsq_id = get_cpu_dsq_id(i); - scx_bpf_dump("CPU[%d] cell=%d vtime=%llu nr_queued=%d\n", i, - cpu_ctx->cell, READ_ONCE(cpu_ctx->vtime_now), - scx_bpf_dsq_nr_queued(dsq_id.raw)); - } - - dump_l3_state(); - -} - -void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, - struct task_struct *p) -{ - struct task_ctx *tctx; - - if (!(tctx = lookup_task_ctx(p))) - return; - - scx_bpf_dump( - "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%llu all_cell_cpus_allowed=%d\n", - p->pid, p->scx.dsq_vtime, tctx->basis_vtime, tctx->cell, - tctx->dsq.raw, tctx->all_cell_cpus_allowed); - scx_bpf_dump("Task[%d] CPUS=", p->pid); - dump_cpumask(p->cpus_ptr); - scx_bpf_dump("\n"); -} - -s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) -{ - struct bpf_cpumask *cpumask; - u32 i; - s32 ret; - - struct cgroup *rootcg; - if (!(rootcg = bpf_cgroup_from_id(root_cgid))) - return -ENOENT; - - rootcg = bpf_kptr_xchg(&root_cgrp, rootcg); - if (rootcg) - bpf_cgroup_release(rootcg); - - /* setup all_cpumask */ - cpumask = bpf_cpumask_create(); - if (!cpumask) - return -ENOMEM; - - bpf_for(i, 0, nr_possible_cpus) - { - const volatile u8 *u8_ptr; - - if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { - if (*u8_ptr & (1 << (i % 8))) { - bpf_cpumask_set_cpu(i, cpumask); - ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, ANY_NUMA); - if (ret < 0) { - 
bpf_cpumask_release(cpumask); - return ret; - } - } - } else { - return -EINVAL; - } - } - - - cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); - if (cpumask) - bpf_cpumask_release(cpumask); - - /* setup cell cpumasks */ - bpf_for(i, 0, MAX_CELLS) - { - struct cell_cpumask_wrapper *cpumaskw; - if (!(cpumaskw = bpf_map_lookup_elem(&cell_cpumasks, &i))) - return -ENOENT; - - cpumask = bpf_cpumask_create(); - if (!cpumask) - return -ENOMEM; - - /* - * Start with all full cpumask for all cells. They'll get setup in - * cgroup_init - */ - bpf_cpumask_setall(cpumask); - - cpumask = bpf_kptr_xchg(&cpumaskw->cpumask, cpumask); - if (cpumask) { - /* Should be impossible, we just initialized the cell cpumask */ - bpf_cpumask_release(cpumask); - return -EINVAL; - } - - cpumask = bpf_cpumask_create(); - if (!cpumask) - return -ENOMEM; - cpumask = bpf_kptr_xchg(&cpumaskw->tmp_cpumask, cpumask); - if (cpumask) { - /* Should be impossible, we just initialized the cell tmp_cpumask */ - bpf_cpumask_release(cpumask); - return -EINVAL; - } - } - - // cells[0].in_use = true; - lookup_cell(0)->in_use = true; - - /* Configure root cell (cell 0) topology at init time using nr_l3 and l3_to_cpu masks */ - recalc_cell_l3_counts(ROOT_CELL_ID); - - /* Create (cell,L3) DSQs for all pairs. Userspace will populate maps. */ - // This is a crazy over-estimate - bpf_for(i, 0, MAX_CELLS) - { - u32 l3; - bpf_for(l3, 0, nr_l3) - { - ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, ANY_NUMA); - if (ret < 0) - scx_bpf_error( "Failed to create DSQ for cell %d, L3 %d: err %d", i, l3, ret); - } - } - - return 0; -} - -void BPF_STRUCT_OPS(mitosis_exit, struct scx_exit_info *ei) -{ - // int i; - // bpf_for(i, 0, MAX_CELLS); { - // dump_cell_state((u32)i); - // } - - UEI_RECORD(uei, ei); -} - -SEC(".struct_ops.link") -struct sched_ext_ops mitosis = { - .select_cpu = (void *)mitosis_select_cpu, - .enqueue = (void *)mitosis_enqueue, - .dispatch = (void *)mitosis_dispatch, - .tick = (void *)mitosis_tick, - .running = (void *)mitosis_running, - .stopping = (void *)mitosis_stopping, - .set_cpumask = (void *)mitosis_set_cpumask, - .init_task = (void *)mitosis_init_task, - .cgroup_init = (void *)mitosis_cgroup_init, - .cgroup_exit = (void *)mitosis_cgroup_exit, - .cgroup_move = (void *)mitosis_cgroup_move, - .dump = (void *)mitosis_dump, - .dump_task = (void *)mitosis_dump_task, - .init = (void *)mitosis_init, - .exit = (void *)mitosis_exit, - .name = "mitosis", -}; -# File: scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h -/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -/* - * This software may be used and distributed according to the terms of the - * GNU General Public License version 2. - * - * This defines the core data structures, types, and constants - * for the scx_mitosis scheduler, primarily containing `struct cell` - * and `struct task_ctx`. - */ - -#pragma once - -#ifdef LSP -#define __bpf__ -#include "../../../../include/scx/common.bpf.h" -#include "../../../../include/scx/ravg_impl.bpf.h" -#else -#include -#include -#endif - -#include "intf.h" - -#define MAX_L3S 16 - -#include "dsq.bpf.h" - -/* - * A couple of tricky things about checking a cgroup's cpumask: - * - * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get - * this right now is to copy the cpumask to a map entry. Given that cgroup init - * could be re-entrant we have a few per-cpu entries in a map to make this - * doable. 
- * - * Second, cpumask can sometimes be stored as an array in-situ or as a pointer - * and with different lengths. Some bpf_core_type_matches finagling can make - * this all work. - */ -#define MAX_CPUMASK_ENTRIES (4) - -/* - * We don't know how big struct cpumask is at compile time, so just allocate a - * large space and check that it is big enough at runtime - * TODO: This should be deduplicated with the rust code and put in intf.h - */ -#define CPUMASK_LONG_ENTRIES (128) -#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) - -extern const volatile u32 nr_l3; - -extern struct cell_map cells; - - -enum mitosis_constants { - - /* Root cell index */ - ROOT_CELL_ID = 0, - - /* Invalid/unset L3 value */ - // INVALID_L3_ID = -1, - - /* Default weight divisor for vtime calculation */ - DEFAULT_WEIGHT_MULTIPLIER = 100, - - /* Vtime validation multiplier (slice_ns * 8192) */ - VTIME_MAX_FUTURE_MULTIPLIER = 8192, - - /* Bits per u32 for cpumask operations */ - BITS_PER_U32 = 32, - - /* No NUMA constraint for DSQ creation */ - ANY_NUMA = -1, -}; - -struct cell { - struct bpf_spin_lock lock; - - // Whether or not the cell is used or not - u32 in_use; - // Number of CPUs in this cell - u32 cpu_cnt; - // per-L3 vtimes within this cell - u64 l3_vtime_now[MAX_L3S]; - // Number of CPUs from each L3 assigned to this cell - u32 l3_cpu_cnt[MAX_L3S]; - // Number of L3s with at least one CPU in this cell - u32 l3_present_cnt; - - // TODO XXX remove this, only here temporarily to make the code compile - // current vtime of the cell - u64 vtime_now; -}; - -// #if 0 -/* Wrap the spin lock in a struct for verifier */ -// struct cell_lock_wrapper { -// struct bpf_spin_lock lock; -// }; - -// struct cell_locks_map { -// __uint(type, BPF_MAP_TYPE_ARRAY); -// __type(key, u32); -// __type(value, struct cell_lock_wrapper); -// __uint(max_entries, MAX_CELLS); -// }; - -#define WITH_CELL_LOCK(cell_ptr, cell_idx, block) \ - do { \ - struct bpf_spin_lock *lock = get_cell_lock(cell_idx); \ - if (!lock) { \ - scx_bpf_error("Failed to get lock for cell %d", \ - cell_idx); \ - break; \ - } \ - bpf_spin_lock(lock); \ - block bpf_spin_unlock(lock); \ - } while (0) - -static inline struct cell *lookup_cell(int idx) -{ - struct cell *cell; - - // cell = MEMBER_VPTR(cells, [idx]); - cell = bpf_map_lookup_elem(&cells, &idx); - - - if (!cell) { - scx_bpf_error("Invalid cell %d", idx); - return NULL; - } - return cell; -} - -static inline struct bpf_spin_lock *get_cell_lock(u32 cell_idx) -{ - if (cell_idx >= MAX_CELLS) { - scx_bpf_error("Invalid cell index %d", cell_idx); - return NULL; - } - - struct cell *cell = lookup_cell(cell_idx); - if (!cell) { - scx_bpf_error("Cell %d not found", cell_idx); - return NULL; - } - return &cell->lock; -} -// #endif - -/* - * task_ctx is the per-task information kept by scx_mitosis - */ -struct task_ctx { - /* cpumask is the set of valid cpus this task can schedule on */ - /* (tasks cpumask anded with its cell cpumask) */ - struct bpf_cpumask __kptr *cpumask; - /* started_running_at for recording runtime */ - u64 started_running_at; - u64 basis_vtime; - /* For the sake of monitoring, each task is owned by a cell */ - u32 cell; - /* For the sake of scheduling, a task is exclusively owned by either a cell - * or a cpu */ - dsq_id_t dsq; - /* latest configuration that was applied for this task */ - /* (to know if it has to be re-applied) */ - u32 configuration_seq; - /* Is this task allowed on all cores of its cell? 
*/ - bool all_cell_cpus_allowed; - // Which L3 this task is assigned to - s32 l3; - -#if MITOSIS_ENABLE_STEALING - /* When a task is stolen, dispatch() marks the destination L3 here. - * running() applies the retag and recomputes cpumask (vtime preserved). - */ - s32 pending_l3; - u32 steal_count; /* how many times this task has been stolen */ - u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ - u32 steals_prevented; /* how many times this task has been prevented from being stolen */ -#endif -}; - -// These could go in mitosis.bpf.h, but we'll cross that bridge when we get -static inline const struct cpumask *lookup_cell_cpumask(int idx); - -static inline struct task_ctx *lookup_task_ctx(struct task_struct *p); - -/* MAP TYPES */ -struct function_counters_map { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, NR_COUNTERS); -}; - -struct cell_map { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, struct cell); - __uint(max_entries, MAX_CELLS); -}; - -struct rcu_read_guard { - bool active; -}; - -static inline struct rcu_read_guard rcu_read_lock_guard(void) -{ - bpf_rcu_read_lock(); - return (struct rcu_read_guard){ .active = true }; -} - -static inline void rcu_read_guard_release(struct rcu_read_guard *guard) -{ - if (guard->active) { - bpf_rcu_read_unlock(); - guard->active = false; - } -} -#define RCU_READ_GUARD() \ - struct rcu_read_guard __rcu_guard \ - __attribute__((__cleanup__(rcu_read_guard_release))) = \ - rcu_read_lock_guard() - -struct cpumask_guard { - struct bpf_cpumask *mask; -}; - -static inline struct cpumask_guard cpumask_create_guard(void) -{ - struct bpf_cpumask *mask = bpf_cpumask_create(); - return (struct cpumask_guard){ .mask = mask }; -} - -static inline void cpumask_guard_release(struct cpumask_guard *guard) -{ - if (guard->mask) { - bpf_cpumask_release(guard->mask); - guard->mask = NULL; - } -} - -#define CPUMASK_GUARD(var_name) \ - struct cpumask_guard var_name \ - __attribute__((__cleanup__(cpumask_guard_release))) = \ - cpumask_create_guard() From 0b126e969bb266799307904d074f4b35a55a8ba2 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Fri, 10 Oct 2025 14:33:24 -0700 Subject: [PATCH 12/12] Fix work stealing bug --- scheds/rust/scx_mitosis/src/bpf/intf.h | 2 -- scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 6 ++---- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 5 ----- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index b1612430c6..b1fcbf7941 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -20,9 +20,7 @@ typedef _Bool bool; #endif /* ---- Work stealing config (compile-time) ------------------------------- */ -#ifndef MITOSIS_ENABLE_STEALING #define MITOSIS_ENABLE_STEALING 1 -#endif /* ----------------------------------------------------------------------- */ enum consts { diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h index 2e5281984b..492b2723c7 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -20,9 +20,7 @@ typedef u32 l3_id_t; // Configure how aggressively we steal work. 
// When task is detected as a steal candidate, skip it this many times // On a web server workload, 100 reduced steal count by ~90% -#ifdef MITOSIS_ENABLE_STEALING #define PREVENT_N_STEALS 0 -#endif /* Work stealing statistics map - accessible from both BPF and userspace */ struct steal_stats_map { @@ -213,7 +211,7 @@ static inline s32 pick_l3_for_task(u32 cell_id) return ret; } -#ifdef MITOSIS_ENABLE_STEALING +#if MITOSIS_ENABLE_STEALING static inline bool try_stealing_this_task(struct task_ctx *task_ctx, s32 local_l3, u64 candidate_dsq) @@ -280,7 +278,7 @@ static inline bool try_stealing_work(u32 cell, s32 local_l3) // Optimization: skip if faster than constructing an iterator // Not redundant with later checking if task found (race) - if (scx_bpf_dsq_nr_queued(candidate_dsq)) + if (!scx_bpf_dsq_nr_queued(candidate_dsq)) continue; // Just a trick for peeking the head element diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 44cfee2f3d..3e1eac406e 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -1625,11 +1625,6 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) void BPF_STRUCT_OPS(mitosis_exit, struct scx_exit_info *ei) { - // int i; - // bpf_for(i, 0, MAX_CELLS); { - // dump_cell_state((u32)i); - // } - UEI_RECORD(uei, ei); }
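
Context for the work-stealing fix above: the early check in try_stealing_work() was inverted, so candidate (cell, L3) DSQs that had tasks queued were skipped while empty ones were scanned, and stealing effectively never found work. The hunk flips the test so empty candidates are skipped instead. The switch from #ifdef to #if MITOSIS_ENABLE_STEALING likewise matches intf.h now defining the flag unconditionally to 1, so the flag's value rather than its mere definedness gates the code. Below is a minimal sketch of the corrected skip logic only, assuming the candidate-L3 loop shown in the hunk; candidate_has_work() is a hypothetical helper used purely for illustration, and only the scx_bpf_dsq_nr_queued() test mirrors the actual code.

/* Hypothetical helper illustrating the corrected check (not in the tree). */
static inline bool candidate_has_work(u64 candidate_dsq)
{
	/* Skip empty DSQs cheaply; only queues with tasks are worth iterating.
	 * The pre-fix code tested the opposite condition, skipping exactly the
	 * DSQs that had something to steal. */
	return scx_bpf_dsq_nr_queued(candidate_dsq) > 0;
}

/* Usage inside a loop over candidate L3s (sketch):
 *
 *	if (!candidate_has_work(candidate_dsq))
 *		continue;
 *	// otherwise peek the head task and attempt the steal
 */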