From 1b11d2bb14ebfaaec8cc2792c72d36b5ba7bfa08 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Mon, 8 Sep 2025 11:27:59 -0700 Subject: [PATCH 01/12] Only print stats when cell is in_use --- scheds/rust/scx_mitosis/src/main.rs | 33 +++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index 8f42568df6..c8740c918d 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ b/scheds/rust/scx_mitosis/src/main.rs @@ -159,6 +159,13 @@ impl Display for DistributionStats { } impl<'a> Scheduler<'a> { + fn is_cell_in_use(&self, cell_id: u32) -> bool { + let cells = &self.skel.maps.bss_data.as_ref().unwrap().cells; + let bpf_cell = cells[cell_id as usize]; + let in_use = unsafe { std::ptr::read_volatile(&bpf_cell.in_use as *const u32) }; + in_use != 0 + } + fn init(opts: &Opts, open_object: &'a mut MaybeUninit) -> Result { let topology = Topology::new()?; @@ -327,8 +334,8 @@ impl<'a> Scheduler<'a> { .map(|&stat| cell_stats_delta[cell][stat as usize]) .sum::(); - // FIXME: This should really query if the cell is enabled or not. - if cell_queue_decisions == 0 { + // Only print stats for cells that are in use and have decisions + if cell_queue_decisions == 0 || !self.is_cell_in_use(cell as u32) { continue; } @@ -418,7 +425,29 @@ impl<'a> Scheduler<'a> { self.log_all_queue_stats(&cell_stats_delta)?; for (cell_id, cell) in &self.cells { + // Check if cell is actually in use from BPF before printing + if !self.is_cell_in_use(*cell_id) { + continue; + } + trace!("CELL[{}]: {}", cell_id, cell.cpus); + + // Read current CPU assignments directly from BPF for comparison + let mut bpf_cpus = Cpumask::new(); + let cpu_ctxs = read_cpu_ctxs(&self.skel)?; + for (i, cpu_ctx) in cpu_ctxs.iter().enumerate() { + if cpu_ctx.cell == *cell_id { + bpf_cpus.set_cpu(i).expect("set cpu in bpf mask"); + } + } + + trace!("CELL[{}]: BPF={}", cell_id, bpf_cpus); + + // Flag potential staleness + if cell.cpus != bpf_cpus { + warn!("STALENESS DETECTED: CELL[{}] userspace={} != bpf={}", + cell_id, cell.cpus, bpf_cpus); + } } for (cell_id, cell) in self.cells.iter() { From 0b7a1fc9de5583730845173102bdfe8ca33a9497 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Mon, 8 Sep 2025 11:39:39 -0700 Subject: [PATCH 02/12] Fix cpumask cleanup. RAII for running guard. --- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index b40c06e79d..266a8cb30b 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -758,17 +758,18 @@ static inline int get_cgroup_cpumask(struct cgroup *cgrp, u32 level_cells[MAX_CG_DEPTH]; int running; +/* The guard is a stack variable. When it falls out of scope, + * we drop the running lock. */ +static inline void __running_unlock(int *guard) { + (void)guard; /* unused */ + WRITE_ONCE(running, 0); +} + /* * On tick, we identify new cells and apply CPU assignment */ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) { - /* - * We serialize tick() on core 0 and ensure only one tick running at a time - * to ensure this can only happen once. 
- */ - if (bpf_get_smp_processor_id()) - return; u32 local_configuration_seq = READ_ONCE(configuration_seq); if (local_configuration_seq == READ_ONCE(applied_configuration_seq)) @@ -779,6 +780,8 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return; + int __attribute__((cleanup(__running_unlock), unused)) __running_guard; + DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); if (!entry) return; @@ -967,13 +970,11 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) int cpu_idx; bpf_for(cpu_idx, 0, nr_possible_cpus) { - if (bpf_cpumask_test_cpu( - cpu_idx, (const struct cpumask *)&entry->cpumask)) { + if (bpf_cpumask_test_cpu( cpu_idx, (const struct cpumask *)root_bpf_cpumask)) { struct cpu_ctx *cpu_ctx; if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) goto out_root_cgrp; cpu_ctx->cell = 0; - bpf_cpumask_clear_cpu(cpu_idx, root_bpf_cpumask); } } @@ -994,7 +995,6 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) barrier(); WRITE_ONCE(applied_configuration_seq, local_configuration_seq); - WRITE_ONCE(running, 0); bpf_cgroup_release(root_cgrp_ref); return; From 0c3c7bb828b08075f2328ac8c4de40a20911966c Mon Sep 17 00:00:00 2001 From: tommy-u Date: Mon, 8 Sep 2025 15:22:27 -0700 Subject: [PATCH 03/12] Preparing datastructures and helper functions for core scheduler modification. --- scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h | 132 +++++++++++++++ scheds/rust/scx_mitosis/src/bpf/intf.h | 17 +- .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 150 ++++++++++++++++++ scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 60 ++----- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 104 ++++++++++++ 5 files changed, 404 insertions(+), 59 deletions(-) create mode 100644 scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h create mode 100644 scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h create mode 100644 scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h new file mode 100644 index 0000000000..a545cb72ad --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -0,0 +1,132 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header defines the 32-bit dispatch queue (DSQ) ID encoding + * scheme for scx_mitosis, using type fields to distinguish between + * per-CPU and cell+L3 domain queues. It includes helper functions to + * construct, validate, and parse these DSQ IDs for queue management. + */ +#pragma once + +#include "intf.h" +#include "mitosis.bpf.h" + +/* + * ================================ + * BPF DSQ ID Layout (64 bits wide) + * ================================ + * + * Top-level format: + * [63] [62..0] + * [ B] [ ID ] + * + * If B == 1 it is a Built-in DSQ + * ------------------------- + * [63] [62] [61 .. 32] [31..0] + * [ 1] [ L] [ R ] [ V ] + * + * - L (bit 62): LOCAL_ON flag + * If L == 1 -> V = CPU number + * - R (30 bits): reserved / unused + * - V (32 bits): value (e.g., CPU#) + * + * If B == 0 -> User-defined DSQ + * ----------------------------- + * Only the low 32 bits are used. + * + * [63 .. 32] [31..0] + * [ 0s or unused ] [ VAL ] + * + * Mitosis uses VAL as follows: + * + * [31..24] [23..0] + * [QTYPE ] [DATA ] + * + * QTYPE encodes the queue type (exactly one bit set): + * + * QTYPE = 0x1 -> Per-CPU Q + * [31 .. 24] [23 .. 16] [15 .. 
0] + * [00000001] [00000000] [ CPU# ] + * [Q-TYPE:1] + * + * QTYPE = 0x2 -> Cell+L3 Q + * [31 .. 24] [23 .. 16] [15 .. 0] + * [00000010] [ CELL# ] [ L3ID ] + * [Q-TYPE:2] + * + */ + +#define DSQ_ERROR 0xFFFFFFFF; /* Error value for DSQ functions */ + +/* DSQ type enumeration */ +enum dsq_type { + DSQ_UNKNOWN, + DSQ_TYPE_CPU, + DSQ_TYPE_CELL_L3, +}; + +/* DSQ ID structure using unions for type-safe access */ +struct dsq_cpu { + u32 cpu : 16; + u32 unused : 8; + u32 type : 8; +} __attribute__((packed)); + +struct dsq_cell_l3 { + u32 l3 : 16; + u32 cell : 8; + u32 type : 8; +} __attribute__((packed)); + +union dsq_id { + u32 raw; + struct dsq_cpu cpu; + struct dsq_cell_l3 cell_l3; + struct { + u32 data : 24; + u32 type : 8; + } common; +} __attribute__((packed)); + +/* Static assertions to ensure correct sizes */ +/* Verify that all DSQ structures are exactly 32 bits */ +_Static_assert(sizeof(struct dsq_cpu) == 4, "dsq_cpu must be 32 bits"); +_Static_assert(sizeof(struct dsq_cell_l3) == 4, "dsq_cell_l3 must be 32 bits"); +_Static_assert(sizeof(union dsq_id) == 4, "dsq_id union must be 32 bits"); + +/* Inline helper functions for DSQ ID manipulation */ + +// Is this a per CPU DSQ? +static inline bool is_cpu_dsq(u32 dsq_id) +{ + union dsq_id id = { .raw = dsq_id }; + return id.common.type == DSQ_TYPE_CPU; +} + +// If this is a per cpu dsq, return the cpu +static inline u32 get_cpu_from_dsq(u32 dsq_id) +{ + union dsq_id id = { .raw = dsq_id }; + if (id.common.type != DSQ_TYPE_CPU) + return DSQ_ERROR; + return id.cpu.cpu; +} + +/* Helper functions to construct DSQ IDs */ +static inline u32 get_cpu_dsq_id(u32 cpu) +{ + if (cpu >= MAX_CPUS) + return DSQ_ERROR; + union dsq_id id = { .cpu = { .cpu = cpu, .unused = 0, .type = DSQ_TYPE_CPU } }; + return id.raw; +} + +static inline u32 get_cell_l3_dsq_id(u32 cell, u32 l3) +{ + if (cell >= MAX_CELLS || l3 >= MAX_L3S) + return DSQ_ERROR; + union dsq_id id = { .cell_l3 = {.l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; + return id.raw; +} diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 0658734545..01e1490aa5 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -18,6 +18,12 @@ typedef _Bool bool; #include #endif +/* ---- Work stealing config (compile-time) ------------------------------- */ +#ifndef MITOSIS_ENABLE_STEALING +#define MITOSIS_ENABLE_STEALING 0 +#endif +/* ----------------------------------------------------------------------- */ + enum consts { CACHELINE_SIZE = 64, MAX_CPUS_SHIFT = 9, @@ -28,6 +34,7 @@ enum consts { PCPU_BASE = 0x80000000, MAX_CG_DEPTH = 256, + }; /* Statistics */ @@ -51,14 +58,4 @@ struct cgrp_ctx { bool cell_owner; }; -/* - * cell is the per-cell book-keeping -*/ -struct cell { - // current vtime of the cell - u64 vtime_now; - // Whether or not the cell is used or not - u32 in_use; -}; - #endif /* __INTF_H */ diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h new file mode 100644 index 0000000000..0ced3fa78b --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -0,0 +1,150 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header adds L3 cache awareness to scx_mitosis by defining BPF + * maps for CPU-to-L3 domain mappings. 
It provides functions to + * recalculate per-L3 CPU counts within cells and implements weighted + * random L3 selection for tasks. It also tracks work-stealing + * statistics for cross-L3 task migrations. + */ +#pragma once + +#include "mitosis.bpf.h" +#include "intf.h" + +// It's also an option to just compute this from the cpu_to_l3 map. +struct l3_cpu_mask { + unsigned long cpumask[CPUMASK_LONG_ENTRIES]; +}; + +/* Work stealing statistics map - accessible from both BPF and userspace */ +struct steal_stats_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 1); +}; + +// A CPU -> L3 cache ID map +struct cpu_to_l3_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, MAX_CPUS); +}; + +struct l3_to_cpus_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct l3_cpu_mask); + __uint(max_entries, MAX_L3S); +}; + +extern struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); +extern struct l3_to_cpus_map l3_to_cpus SEC(".maps"); +extern struct steal_stats_map steal_stats SEC(".maps"); + +static inline const struct cpumask *lookup_l3_cpumask(u32 l3) +{ + struct l3_cpu_mask *mask; + + if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { + scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); + return NULL; + } + + return (const struct cpumask *)mask; +} + +/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes (no persistent kptrs). */ +static __always_inline void recalc_cell_l3_counts(u32 cell_idx) +{ + struct cell *cell = lookup_cell(cell_idx); + if (!cell) + return; + + struct bpf_cpumask *tmp = bpf_cpumask_create(); + if (!tmp) + return; + + u32 l3, present = 0, total_cpus = 0; + + bpf_rcu_read_lock(); + const struct cpumask *cell_mask = + lookup_cell_cpumask(cell_idx); // RCU ptr + if (!cell_mask) { + bpf_rcu_read_unlock(); + bpf_cpumask_release(tmp); + return; + } + + bpf_for(l3, 0, nr_l3) + { + const struct cpumask *l3_mask = + lookup_l3_cpumask(l3); // plain map memory + if (!l3_mask) { + cell->l3_cpu_cnt[l3] = 0; + continue; + } + + /* ok: dst is bpf_cpumask*, sources are (RCU cpumask*, plain cpumask*) */ + bpf_cpumask_and(tmp, cell_mask, l3_mask); + + u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp); + cell->l3_cpu_cnt[l3] = cnt; + total_cpus += cnt; + if (cnt) + present++; + } + bpf_rcu_read_unlock(); + + cell->l3_present_cnt = present; + cell->cpu_cnt = total_cpus; + bpf_cpumask_release(tmp); +} + +/** + * Weighted random selection of an L3 cache domain for a task. + * + * Uses the CPU count in each L3 domain within the cell as weights to + * probabilistically select an L3. L3 domains with more CPUs in the cell + * have higher probability of being selected. 
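+ *
+ * Worked example (illustrative numbers, not taken from a real topology):
+ * with l3_cpu_cnt = {4, 2, 0, 2} and cpu_cnt = 8, target is drawn uniformly
+ * from [0, 8). The cumulative scan then maps targets 0-3 to L3 0, 4-5 to
+ * L3 1 and 6-7 to L3 3, so each L3 is selected with probability equal to
+ * its share of the cell's CPUs; L3 2 contributes no CPUs and is never picked.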
+ * + * @cell_id: The cell ID to select an L3 from + * @return: L3 ID on success, INVALID_L3_ID on error, or 0 as fallback + */ +static inline s32 pick_l3_for_task(u32 cell_id) +{ + struct cell *cell; + u32 l3, target, cur = 0; + s32 ret = INVALID_L3_ID; + + /* Look up the cell structure */ + if (!(cell = lookup_cell(cell_id))) + return INVALID_L3_ID; + + /* Handle case where cell has no CPUs assigned yet */ + if (!cell->cpu_cnt) { + scx_bpf_error( + "pick_l3_for_task: cell %d has no CPUs accounted yet", + cell_id); + return INVALID_L3_ID; + } + + /* Generate random target value in range [0, cpu_cnt) */ + target = bpf_get_prandom_u32() % cell->cpu_cnt; + + /* Find the L3 domain corresponding to the target value using + * weighted selection - accumulate CPU counts until we exceed target */ + bpf_for(l3, 0, nr_l3) + { + cur += cell->l3_cpu_cnt[l3]; + if (target < cur) { + ret = (s32)l3; + break; + } + } + return ret; +} diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 266a8cb30b..2b197c87fc 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -14,14 +14,9 @@ #include "intf.h" -#ifdef LSP -#define __bpf__ -#include "../../../../include/scx/common.bpf.h" -#include "../../../../include/scx/ravg_impl.bpf.h" -#else -#include -#include -#endif +#include "mitosis.bpf.h" +#include "dsq.bpf.h" +#include "l3_aware.bpf.h" char _license[] SEC("license") = "GPL"; @@ -35,6 +30,7 @@ const volatile unsigned char all_cpus[MAX_CPUS_U8]; const volatile u64 slice_ns; const volatile u64 root_cgid = 1; +const volatile u32 nr_l3 = 1; /* * CPU assignment changes aren't fully in effect until a subsequent tick() * configuration_seq is bumped on each assignment change @@ -48,6 +44,13 @@ private(root_cgrp) struct cgroup __kptr *root_cgrp; UEI_DEFINE(uei); +/* + * Maps used for L3-aware scheduling +*/ +struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); +struct l3_to_cpus_map l3_to_cpus SEC(".maps"); +struct steal_stats_map steal_stats SEC(".maps"); + /* * We store per-cpu values along with per-cell values. Helper functions to * translate. @@ -119,27 +122,6 @@ static inline struct cgroup *task_cgroup(struct task_struct *p) return cgrp; } -/* - * task_ctx is the per-task information kept by scx_mitosis - */ -struct task_ctx { - /* cpumask is the set of valid cpus this task can schedule on */ - /* (tasks cpumask anded with its cell cpumask) */ - struct bpf_cpumask __kptr *cpumask; - /* started_running_at for recording runtime */ - u64 started_running_at; - u64 basis_vtime; - /* For the sake of monitoring, each task is owned by a cell */ - u32 cell; - /* For the sake of scheduling, a task is exclusively owned by either a cell - * or a cpu */ - u32 dsq; - /* latest configuration that was applied for this task */ - /* (to know if it has to be re-applied) */ - u32 configuration_seq; - /* Is this task allowed on all cores of its cell? */ - bool all_cell_cpus_allowed; -}; struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); @@ -607,26 +589,6 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) scx_bpf_dsq_move_to_local(dsq); } -/* - * A couple of tricky things about checking a cgroup's cpumask: - * - * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get - * this right now is to copy the cpumask to a map entry. Given that cgroup init - * could be re-entrant we have a few per-cpu entries in a map to make this - * doable. 
- * - * Second, cpumask can sometimes be stored as an array in-situ or as a pointer - * and with different lengths. Some bpf_core_type_matches finagling can make - * this all work. - */ -#define MAX_CPUMASK_ENTRIES (4) - -/* - * We don't know how big struct cpumask is at compile time, so just allocate a - * large space and check that it is big enough at runtime - */ -#define CPUMASK_LONG_ENTRIES (128) -#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) struct cpumask_entry { unsigned long cpumask[CPUMASK_LONG_ENTRIES]; diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h new file mode 100644 index 0000000000..e39bbd92d8 --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -0,0 +1,104 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This defines the core data structures, types, and constants + * for the scx_mitosis scheduler, primarily containing `struct cell` + * and `struct task_ctx`. + */ + +#pragma once + +#ifdef LSP +#define __bpf__ +#include "../../../../include/scx/common.bpf.h" +#include "../../../../include/scx/ravg_impl.bpf.h" +#else +#include +#include +#endif + +#include "intf.h" + +#define MAX_L3S 16 + +/* + * A couple of tricky things about checking a cgroup's cpumask: + * + * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get + * this right now is to copy the cpumask to a map entry. Given that cgroup init + * could be re-entrant we have a few per-cpu entries in a map to make this + * doable. + * + * Second, cpumask can sometimes be stored as an array in-situ or as a pointer + * and with different lengths. Some bpf_core_type_matches finagling can make + * this all work. + */ +#define MAX_CPUMASK_ENTRIES (4) + +extern const volatile u32 nr_l3; + +/* + * We don't know how big struct cpumask is at compile time, so just allocate a + * large space and check that it is big enough at runtime + */ +#define CPUMASK_LONG_ENTRIES (128) +#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) + +enum mitosis_constants { + /* Invalid/unset L3 value */ + INVALID_L3_ID = -1, +}; + +struct cell { + // Whether or not the cell is used or not + u32 in_use; + // Number of CPUs in this cell + u32 cpu_cnt; + // per-L3 vtimes within this cell + u64 l3_vtime_now[MAX_L3S]; + // Number of CPUs from each L3 assigned to this cell + u32 l3_cpu_cnt[MAX_L3S]; + // Number of L3s with at least one CPU in this cell + u32 l3_present_cnt; + + // TODO XXX remove this, only here temporarily to make the code compile + // current vtime of the cell + u64 vtime_now; +}; + +/* + * task_ctx is the per-task information kept by scx_mitosis + */ +struct task_ctx { + /* cpumask is the set of valid cpus this task can schedule on */ + /* (tasks cpumask anded with its cell cpumask) */ + struct bpf_cpumask __kptr *cpumask; + /* started_running_at for recording runtime */ + u64 started_running_at; + u64 basis_vtime; + /* For the sake of monitoring, each task is owned by a cell */ + u32 cell; + /* For the sake of scheduling, a task is exclusively owned by either a cell + * or a cpu */ + u32 dsq; + /* latest configuration that was applied for this task */ + /* (to know if it has to be re-applied) */ + u32 configuration_seq; + /* Is this task allowed on all cores of its cell? 
*/ + bool all_cell_cpus_allowed; + +#if MITOSIS_ENABLE_STEALING + /* When a task is stolen, dispatch() marks the destination L3 here. + * running() applies the retag and recomputes cpumask (vtime preserved). + */ + s32 pending_l3; + u32 steal_count; /* how many times this task has been stolen */ + u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ +#endif +}; + +// These could go in mitosis.bpf.h, but we'll cross that bridge when we get +static inline struct cell *lookup_cell(int idx); +static inline const struct cpumask *lookup_cell_cpumask(int idx); From ba1924b18fff93f28fec50b67e92c420a28d5301 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Mon, 8 Sep 2025 18:11:30 -0700 Subject: [PATCH 04/12] Prepare rust side for l3 awareness --- scheds/rust/scx_mitosis/src/bpf/intf.h | 8 + scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 5 + scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 13 +- scheds/rust/scx_mitosis/src/main.rs | 333 ++++++++++++++++-- .../scx_mitosis/src/mitosis_topology_utils.rs | 168 +++++++++ scheds/rust/scx_mitosis/src/stats.rs | 2 + 6 files changed, 504 insertions(+), 25 deletions(-) create mode 100644 scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 01e1490aa5..89c0096fd7 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -46,6 +46,14 @@ enum cell_stat_idx { NR_CSTATS, }; +/* Function invocation counters */ +enum counter_idx { + COUNTER_SELECT_CPU, + COUNTER_ENQUEUE, + COUNTER_DISPATCH, + NR_COUNTERS, +}; + struct cpu_ctx { u64 cstats[MAX_CELLS][NR_CSTATS]; u64 cell_cycles[MAX_CELLS]; diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 2b197c87fc..dd3f2cf240 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -49,6 +49,11 @@ UEI_DEFINE(uei); */ struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); struct l3_to_cpus_map l3_to_cpus SEC(".maps"); + +/* + * Maps for statistics +*/ +struct function_counters_map function_counters SEC(".maps"); struct steal_stats_map steal_stats SEC(".maps"); /* diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index e39bbd92d8..a4569f883e 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -37,15 +37,16 @@ */ #define MAX_CPUMASK_ENTRIES (4) -extern const volatile u32 nr_l3; - /* * We don't know how big struct cpumask is at compile time, so just allocate a * large space and check that it is big enough at runtime + * TODO: This should be deduplicated with the rust code and put in intf.h */ #define CPUMASK_LONG_ENTRIES (128) #define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) +extern const volatile u32 nr_l3; + enum mitosis_constants { /* Invalid/unset L3 value */ INVALID_L3_ID = -1, @@ -102,3 +103,11 @@ struct task_ctx { // These could go in mitosis.bpf.h, but we'll cross that bridge when we get static inline struct cell *lookup_cell(int idx); static inline const struct cpumask *lookup_cell_cpumask(int idx); + +/* MAP TYPES */ +struct function_counters_map { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, NR_COUNTERS); +}; diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index c8740c918d..c68a1476b1 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ 
b/scheds/rust/scx_mitosis/src/main.rs @@ -6,6 +6,7 @@ mod bpf_skel; pub use bpf_skel::*; pub mod bpf_intf; mod stats; +mod mitosis_topology_utils; use std::cmp::max; use std::collections::HashMap; @@ -16,14 +17,14 @@ use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; +use std::sync::Mutex; use anyhow::bail; use anyhow::Context; use anyhow::Result; use clap::Parser; use crossbeam::channel::RecvTimeoutError; -use libbpf_rs::MapCore as _; -use libbpf_rs::OpenObject; +use libbpf_rs::{MapCore, OpenObject}; use log::debug; use log::info; use log::trace; @@ -46,11 +47,42 @@ use scx_utils::NR_CPUS_POSSIBLE; use stats::CellMetrics; use stats::Metrics; +use crate::mitosis_topology_utils::{populate_topology_maps, MapKind}; const SCHEDULER_NAME: &str = "scx_mitosis"; const MAX_CELLS: usize = bpf_intf::consts_MAX_CELLS as usize; const NR_CSTATS: usize = bpf_intf::cell_stat_idx_NR_CSTATS as usize; +// Can we deduplicate this with mitosis.bpf.h? +const CPUMASK_LONG_ENTRIES: usize = 128; + +// Global debug flags +// TODO: These will be runtime adjustable via a CLI option. +static DEBUG_FLAGS: std::sync::LazyLock>> = std::sync::LazyLock::new(|| { + let mut flags = HashMap::new(); + flags.insert("cpu_to_l3".to_string(), false); + flags.insert("l3_to_cpus".to_string(), false); + flags.insert("cells".to_string(), true ); + flags.insert("counters".to_string(), true ); + flags.insert("steals".to_string(), true ); + flags.insert("metrics".to_string(), true ); + Mutex::new(flags) +}); + +/// Debug Printers +const ANSI_RED: &str = "\x1b[31m"; +const ANSI_GREEN: &str = "\x1b[32m"; +const ANSI_RESET: &str = "\x1b[0m"; + +/// Check if a debug flag is enabled +fn is_debug_flag_enabled(flag: &str) -> bool { + if let Ok(flags) = DEBUG_FLAGS.lock() { + flags.get(flag).copied().unwrap_or(false) + } else { + false + } +} + /// scx_mitosis: A dynamic affinity scheduler /// /// Cgroups are assigned to a dynamic number of Cells which are assigned to a @@ -117,9 +149,11 @@ struct Scheduler<'a> { // These are the per-cell cstats. // Note these are accumulated across all CPUs. 
prev_cell_stats: [[u64; NR_CSTATS]; MAX_CELLS], + prev_total_steals: u64, metrics: Metrics, stats_server: StatsServer<(), Metrics>, last_configuration_seq: Option, + iteration_count: u64, } struct DistributionStats { @@ -146,7 +180,7 @@ impl Display for DistributionStats { ); write!( f, - "{:width$} {:5.1}% | Local:{:4.1}% From: CPU:{:4.1}% Cell:{:4.1}% | V:{:4.1}%", + "{:width$} {:5.1}% | Local:{:5.1}% From: CPU:{:4.1}% Cell:{:5.1}% | V:{:4.1}%", self.total_decisions, self.share_of_decisions_pct, self.local_q_pct, @@ -189,12 +223,23 @@ impl<'a> Scheduler<'a> { skel.maps.rodata_data.as_mut().unwrap().all_cpus[cpu / 8] |= 1 << (cpu % 8); } + skel.maps.rodata_data.as_mut().unwrap().nr_l3 = topology.all_llcs.len() as u32; + + // print the number of l3s we detected + info!("Found {} L3s", topology.all_llcs.len()); + match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP { 0 => info!("Kernel does not support queued wakeup optimization."), v => skel.struct_ops.mitosis_mut().flags |= v, } - let skel = scx_ops_load!(skel, mitosis, uei)?; + let mut skel = scx_ops_load!(skel, mitosis, uei)?; + + // Set up CPU to L3 topology mapping using the common functionality + populate_topology_maps(&mut skel, MapKind::CpuToL3, None)?; + + // Set up L3 to CPUs mapping using the common functionality + populate_topology_maps(&mut skel, MapKind::L3ToCpus, None)?; let stats_server = StatsServer::new(stats::server_data()).launch()?; @@ -203,9 +248,11 @@ impl<'a> Scheduler<'a> { monitor_interval: Duration::from_secs(opts.monitor_interval_s), cells: HashMap::new(), prev_cell_stats: [[0; NR_CSTATS]; MAX_CELLS], + prev_total_steals: 0, metrics: Metrics::default(), stats_server, last_configuration_seq: None, + iteration_count: 0, }) } @@ -217,6 +264,7 @@ impl<'a> Scheduler<'a> { let (res_ch, req_ch) = self.stats_server.channels(); while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) { + self.iteration_count += 1; self.refresh_bpf_cells()?; self.collect_metrics()?; @@ -299,7 +347,7 @@ impl<'a> Scheduler<'a> { } } - let prefix = "Total Decisions:"; + let prefix = " Total: "; // Here we want to sum the affinity violations over all cells. 
let scope_affn_viols: u64 = cell_stats_delta @@ -317,7 +365,10 @@ impl<'a> Scheduler<'a> { self.metrics.update(&stats); - trace!("{} {}", prefix, stats); + if is_debug_flag_enabled("metrics") { + trace!("{}{}{}", ANSI_GREEN, "metrics:", ANSI_RESET); + trace!("{} {}", prefix, stats); + } Ok(()) } @@ -335,7 +386,7 @@ impl<'a> Scheduler<'a> { .sum::(); // Only print stats for cells that are in use and have decisions - if cell_queue_decisions == 0 || !self.is_cell_in_use(cell as u32) { + if !self.is_cell_in_use(cell as u32) { continue; } @@ -347,7 +398,7 @@ impl<'a> Scheduler<'a> { const MIN_CELL_WIDTH: usize = 2; let cell_width: usize = max(MIN_CELL_WIDTH, (MAX_CELLS as f64).log10().ceil() as usize); - let prefix = format!(" Cell {:width$}:", cell, width = cell_width); + let prefix = format!(" Cell {:width$}:", cell, width = cell_width); // Sum affinity violations for this cell let scope_affn_viols: u64 = @@ -366,7 +417,9 @@ impl<'a> Scheduler<'a> { .or_default() .update(&stats); - trace!("{} {}", prefix, stats); + if is_debug_flag_enabled("metrics") { + trace!("{} {}", prefix, stats); + } } Ok(()) } @@ -417,36 +470,67 @@ impl<'a> Scheduler<'a> { } Ok(cell_stats_delta) } + /// Print debug printer status summary + fn print_debug_status(&self) { + if let Ok(flags) = DEBUG_FLAGS.lock() { + let mut disabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| (!enabled).then_some(format!("{}~{}{}", ANSI_RED, flag, ANSI_RESET))).collect(); + let mut enabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| enabled.then_some(format!("{}+{}{}", ANSI_GREEN, flag, ANSI_RESET))).collect(); + disabled.extend(enabled); + trace!("Debug Flags: {}", if disabled.is_empty() { "none".to_string() } else { disabled.join(" ") }); + // trace!("hint: sudo ./scx_mitosis cli debug ~/+"); + } + } /// Collect metrics and out various debugging data like per cell stats, per-cpu stats, etc. fn collect_metrics(&mut self) -> Result<()> { + trace!(""); + trace!("Iteration #{}", self.iteration_count); + let cell_stats_delta = self.calculate_cell_stat_delta()?; self.log_all_queue_stats(&cell_stats_delta)?; + // TODO: I don't really understand this. 
for (cell_id, cell) in &self.cells { // Check if cell is actually in use from BPF before printing if !self.is_cell_in_use(*cell_id) { continue; } - trace!("CELL[{}]: {}", cell_id, cell.cpus); - - // Read current CPU assignments directly from BPF for comparison - let mut bpf_cpus = Cpumask::new(); - let cpu_ctxs = read_cpu_ctxs(&self.skel)?; - for (i, cpu_ctx) in cpu_ctxs.iter().enumerate() { - if cpu_ctx.cell == *cell_id { - bpf_cpus.set_cpu(i).expect("set cpu in bpf mask"); + } + + // Read total steals from BPF and update metrics + self.update_steal_metrics()?; + + // Read and print function counters + self.print_and_reset_function_counters()?; + if is_debug_flag_enabled("cells") { + trace!("{}cells:{}", ANSI_GREEN, ANSI_RESET); + for i in 0..self.cells.len() { + if let Some(cell) = self.cells.get(&(i as u32)) { + trace!(" CELL[{}]: {} ({:3} CPUs)", i, cell.cpus, cell.cpus.weight()); } } + } + + if is_debug_flag_enabled("cpu_to_l3") { + let cpu_to_l3 = read_cpu_to_l3(&self.skel)?; + let cpu_l3_pairs: Vec = cpu_to_l3.iter().enumerate() + .map(|(cpu, l3)| format!("{:3}:{:2}", cpu, l3)) + .collect(); + let chunked_output = cpu_l3_pairs + .chunks(16) + .map(|chunk| chunk.join(" ")) + .collect::>() + .join("\n"); + trace!("{}cpu_to_l3:{}\n{}", ANSI_GREEN, ANSI_RESET, chunked_output); + } - trace!("CELL[{}]: BPF={}", cell_id, bpf_cpus); - - // Flag potential staleness - if cell.cpus != bpf_cpus { - warn!("STALENESS DETECTED: CELL[{}] userspace={} != bpf={}", - cell_id, cell.cpus, bpf_cpus); + if is_debug_flag_enabled("l3_to_cpus") { + trace!("{}l3_to_cpus:{}", ANSI_GREEN, ANSI_RESET); + let l3_to_cpus = read_l3_to_cpus(&self.skel)?; + for (l3_id, mask) in l3_to_cpus.iter() { + trace!("l3_to_cpus: [{:2}] = {}", l3_id, mask); } } @@ -459,9 +543,167 @@ impl<'a> Scheduler<'a> { } self.metrics.num_cells = self.cells.len() as u32; + // Print debug printer status at the end of each cycle + self.print_debug_status(); + Ok(()) } + fn print_and_reset_function_counters(&mut self) -> Result<()> { + if !is_debug_flag_enabled("counters") { + return Ok(()); + } + trace!("{}counters:{}", ANSI_GREEN, ANSI_RESET); + + let counter_names = ["select", "enqueue", "dispatch"]; + let max_name_len = counter_names.iter().map(|name| name.len()).max().unwrap_or(0); + let mut all_counters = Vec::new(); + + // Read counters for each function + for counter_idx in 0..bpf_intf::counter_idx_NR_COUNTERS { + let key = (counter_idx as u32).to_ne_bytes(); + + // Read per-CPU values + let percpu_values = self.skel + .maps + .function_counters + .lookup_percpu(&key, libbpf_rs::MapFlags::ANY) + .context("Failed to lookup function counter")? + .unwrap_or_default(); + + let mut cpu_values = Vec::new(); + for cpu in 0..*NR_CPUS_POSSIBLE { + if cpu < percpu_values.len() { + let value = u64::from_ne_bytes( + percpu_values[cpu].as_slice().try_into() + .context("Failed to convert counter bytes")? 
+ ); + cpu_values.push(value); + } + } + + all_counters.push(cpu_values); + } + + // Calculate and print statistics for each counter + for (idx, counter_values) in all_counters.iter().enumerate() { + if idx >= counter_names.len() { + break; + } + + let name = counter_names[idx]; + let non_zero_values: Vec = counter_values.iter().filter(|&&v| v > 0).copied().collect(); + + if non_zero_values.is_empty() { + trace!(" Fn[{:6} min={:>4} med={:>4} max={:>5} ({:3} CPUs)", + name, total, min, median, max, non_zero_values.len(), width = max_name_len + ); + } + + // Zero out all counters after printing + for counter_idx in 0..bpf_intf::counter_idx_NR_COUNTERS { + let key = (counter_idx as u32).to_ne_bytes(); + let zero_value = 0u64.to_ne_bytes().to_vec(); + + // Create per-CPU values array (all zeros) + let percpu_values: Vec> = (0..*NR_CPUS_POSSIBLE) + .map(|_| zero_value.clone()) + .collect(); + + self.skel + .maps + .function_counters + .update_percpu(&key, &percpu_values, libbpf_rs::MapFlags::ANY) + .context("Failed to reset function counter")?; + } + + Ok(()) + } + +fn update_steal_metrics(&mut self) -> Result<()> { + let steals_debug = is_debug_flag_enabled("steals"); + + // Early out if stealing is compiled out. + if bpf_intf::MITOSIS_ENABLE_STEALING == 0 { + self.metrics.total_steals = 0; + if steals_debug { + trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET); + trace!(" Work stealing disabled at compile time (MITOSIS_ENABLE_STEALING=0)"); + } + return Ok(()); + } + + let key = 0u32.to_ne_bytes(); + + // Read the count; lazily initialize the slot to 0 if it doesn't exist. + let steal_count = match self.skel.maps.steal_stats.lookup(&key, libbpf_rs::MapFlags::ANY) { + Ok(Some(data)) if data.len() >= 8 => { + u64::from_ne_bytes(data[..8].try_into().unwrap()) + } + Ok(Some(_)) => { + if steals_debug { + debug!("steal_stats map data too small"); + } + 0 + } + Ok(None) => { + let zero = 0u64.to_ne_bytes(); + if let Err(e) = self.skel.maps.steal_stats.update(&key, &zero, libbpf_rs::MapFlags::ANY) { + if steals_debug { + debug!("Failed to initialize steal_stats map: {e}"); + } + } + 0 + } + Err(e) => { + if steals_debug { + debug!("Failed to read steal_stats map: {e}"); + } + 0 + } + }; + + // Calculate steals since last update (delta) + let steals_delta = steal_count - self.prev_total_steals; + self.prev_total_steals = steal_count; + self.metrics.total_steals = steals_delta; + + // Early out if we aren't logging. + if !steals_debug { + return Ok(()); + } + + if steals_delta > 0 { + trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET); + trace!(" Work stealing active: steals_since_last={}", steals_delta); + } else { + trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET); + trace!(" Work stealing enabled but no new steals: steals_since_last={}", steals_delta); + } + + Ok(()) +} + + fn refresh_bpf_cells(&mut self) -> Result<()> { let applied_configuration = unsafe { std::ptr::read_volatile( @@ -538,7 +780,52 @@ fn read_cpu_ctxs(skel: &BpfSkel) -> Result> { Ok(cpu_ctxs) } +fn read_cpu_to_l3(skel: &BpfSkel) -> Result> { + let mut cpu_to_l3 = vec![]; + for cpu in 0..*NR_CPUS_POSSIBLE { + let key = (cpu as u32).to_ne_bytes(); + let val = skel + .maps + .cpu_to_l3 + .lookup(&key, libbpf_rs::MapFlags::ANY)? 
+ .map(|v| u32::from_ne_bytes(v.try_into().unwrap())) + .unwrap_or(0); + cpu_to_l3.push(val); + } + Ok(cpu_to_l3) +} + +fn read_l3_to_cpus(skel: &BpfSkel) -> Result> { + let mut l3_to_cpus = vec![]; + + // Get the number of L3 caches from the BPF rodata + let nr_l3 = skel.maps.rodata_data.as_ref().unwrap().nr_l3; + + for l3 in 0..nr_l3 { + let key = (l3 as u32).to_ne_bytes(); + let mask = if let Some(v) = skel + .maps + .l3_to_cpus + .lookup(&key, libbpf_rs::MapFlags::ANY)? + { + let bytes = v.as_slice(); + let mut longs = [0u64; CPUMASK_LONG_ENTRIES]; + let mut i = 0; + while i < CPUMASK_LONG_ENTRIES && i * 8 + 8 <= bytes.len() { + longs[i] = u64::from_ne_bytes(bytes[i * 8..i * 8 + 8].try_into().unwrap()); + i += 1; + } + Cpumask::from_vec(longs.to_vec()) + } else { + Cpumask::new() + }; + l3_to_cpus.push((l3, mask)); + } + Ok(l3_to_cpus) +} + fn main() -> Result<()> { + let opts = Opts::parse(); if opts.version { diff --git a/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs b/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs new file mode 100644 index 0000000000..de19b1e02d --- /dev/null +++ b/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs @@ -0,0 +1,168 @@ +use anyhow::{bail, Context, Result}; +use libbpf_rs::{MapCore, MapFlags}; +use scx_utils::Topology; +use std::collections::HashMap; +use std::io::{self, BufRead, BufReader}; +use std::path::Path; + +use crate::bpf_skel::BpfSkel; + +const CPUMASK_LONG_ENTRIES: usize = 128; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MapKind { + CpuToL3, + L3ToCpus, +} + +impl std::str::FromStr for MapKind { + type Err = anyhow::Error; + fn from_str(s: &str) -> Result { + match s { + "cpu_to_l3" => Ok(MapKind::CpuToL3), + "l3_to_cpus" => Ok(MapKind::L3ToCpus), + _ => bail!("unknown map {s}"), + } + } +} + +impl std::fmt::Display for MapKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + MapKind::CpuToL3 => "cpu_to_l3", + MapKind::L3ToCpus => "l3_to_cpus", + }) + } +} + +pub const SUPPORTED_MAPS: &[MapKind] = &[MapKind::CpuToL3, MapKind::L3ToCpus]; + +/// Parse lines of the form `cpu,l3` from the provided reader. +fn parse_cpu_l3_map(reader: R) -> Result> { + let mut pairs = Vec::new(); + for line in reader.lines() { + let line = line?; + let line = line.trim(); + // Ignore blank lines and comments + if line.is_empty() || line.starts_with('#') { + continue; + } + let mut parts = line.split(','); + let cpu = parts + .next() + .ok_or_else(|| anyhow::anyhow!("missing cpu"))? + .trim() + .parse::()?; + let l3 = parts + .next() + .ok_or_else(|| anyhow::anyhow!("missing l3"))? + .trim() + .parse::()?; + pairs.push((cpu, l3)); + } + Ok(pairs) +} + +/// Read CPU/L3 pairs either from a file or standard input. +fn read_cpu_l3_map(path: &str) -> Result> { + if path == "-" { + println!("reading from stdin"); + let stdin = io::stdin(); + let reader = BufReader::new(stdin.lock()); + parse_cpu_l3_map(reader) + } else { + println!("reading from {path}"); + let file = std::fs::File::open(Path::new(path))?; + let reader = BufReader::new(file); + parse_cpu_l3_map(reader) + } +} + +/// Update map entries either from a file or from the host topology. +/// This function can be used by both the main scheduler and CLI tools. +pub fn populate_topology_maps(skel: &mut BpfSkel, map: MapKind, file: Option) -> Result<()> { + match map { + MapKind::CpuToL3 => { + let map_entries = if let Some(path) = file { + println!("loading from {path}"); + read_cpu_l3_map(&path)? 
+ } else { + println!("loading from host topology"); + let topo = Topology::new()?; + (0..*scx_utils::NR_CPUS_POSSIBLE) + // Use 0 if a CPU is missing from the topology + .map(|cpu| (cpu, topo.all_cpus.get(&cpu).map(|c| c.l3_id).unwrap_or(0))) + .collect() + }; + for (cpu, l3) in map_entries { + // Each CPU index is stored as a 32bit key mapping to its L3 id + let key = (cpu as u32).to_ne_bytes(); + let val = (l3 as u32).to_ne_bytes(); + skel.maps.cpu_to_l3.update(&key, &val, MapFlags::ANY)?; + } + } + MapKind::L3ToCpus => { + if file.is_some() { + anyhow::bail!("Loading l3_to_cpus from file is not supported yet"); + } + + println!("loading l3_to_cpus from host topology"); + let topo = Topology::new()?; + + // Group CPUs by L3 cache ID + let mut l3_to_cpus: HashMap> = HashMap::new(); + for cpu in topo.all_cpus.values() { + l3_to_cpus.entry(cpu.l3_id).or_default().push(cpu.id); + } + + // For each L3 cache, create a cpumask and populate the map + for (l3_id, cpus) in l3_to_cpus { + let key = (l3_id as u32).to_ne_bytes(); + + // Create a cpumask structure that matches the BPF side + let mut cpumask_longs = [0u64; CPUMASK_LONG_ENTRIES]; + + // Set bits for each CPU in this L3 cache + for cpu in cpus { + let long_idx = cpu / 64; + let bit_idx = cpu % 64; + if long_idx < CPUMASK_LONG_ENTRIES { + cpumask_longs[long_idx] |= 1u64 << bit_idx; + } + } + + // Convert to bytes for the map update + let mut value_bytes = Vec::new(); + for long_val in cpumask_longs { + value_bytes.extend_from_slice(&long_val.to_ne_bytes()); + } + + skel.maps.l3_to_cpus.update(&key, &value_bytes, MapFlags::ANY) + .context(format!("Failed to update l3_to_cpus map for L3 {}", l3_id))?; + } + } + } + Ok(()) +} + + +/// Display CPU to L3 cache relationships discovered from the host topology. 
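+///
+/// Example output on a hypothetical host with 8 CPUs split across 2 L3s
+/// (values are illustrative, not taken from a real machine):
+///
+/// ```text
+/// Number L3 caches: 2
+/// CPU -> L3 id:
+/// cpu 0 -> 0
+/// ...
+/// cpu 7 -> 1
+///
+/// L3 id -> [cpus]:
+/// 0 -> [0, 1, 2, 3]
+/// 1 -> [4, 5, 6, 7]
+/// ```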
+pub fn print_topology() -> Result<()> { + let topo = Topology::new()?; + println!("Number L3 caches: {}", topo.all_llcs.len()); + println!("CPU -> L3 id:"); + for cpu in topo.all_cpus.values() { + println!("cpu {} -> {}", cpu.id, cpu.l3_id); + } + println!("\nL3 id -> [cpus]:"); + let mut by_l3: std::collections::BTreeMap> = + std::collections::BTreeMap::new(); + for cpu in topo.all_cpus.values() { + by_l3.entry(cpu.l3_id).or_default().push(cpu.id); + } + for (l3, mut cpus) in by_l3 { + cpus.sort_unstable(); + println!("{l3} -> {:?}", cpus); + } + Ok(()) +} diff --git a/scheds/rust/scx_mitosis/src/stats.rs b/scheds/rust/scx_mitosis/src/stats.rs index 749296a4ff..0cfc001667 100644 --- a/scheds/rust/scx_mitosis/src/stats.rs +++ b/scheds/rust/scx_mitosis/src/stats.rs @@ -65,6 +65,8 @@ pub struct Metrics { pub share_of_decisions_pct: f64, #[stat(desc = "Cell scheduling decisions")] total_decisions: u64, + #[stat(desc = "Work steals since last update")] + pub total_steals: u64, #[stat(desc = "Per-cell metrics")] // TODO: cell names pub cells: BTreeMap, } From 0dd6be6d4d306b616d4927f95da4de972bc32a77 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Thu, 11 Sep 2025 16:53:15 -0700 Subject: [PATCH 05/12] scx_mitosis: add L3 awareness and work stealing --- scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h | 154 ++++--- scheds/rust/scx_mitosis/src/bpf/intf.h | 3 +- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 405 ++++++++++++++---- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 23 +- 4 files changed, 446 insertions(+), 139 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h index a545cb72ad..a6b899d2f5 100644 --- a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -3,7 +3,7 @@ * This software may be used and distributed according to the terms of the * GNU General Public License version 2. * - * This header defines the 32-bit dispatch queue (DSQ) ID encoding + * This header defines the 64-bit dispatch queue (DSQ) ID encoding * scheme for scx_mitosis, using type fields to distinguish between * per-CPU and cell+L3 domain queues. It includes helper functions to * construct, validate, and parse these DSQ IDs for queue management. @@ -37,96 +37,138 @@ * Only the low 32 bits are used. * * [63 .. 32] [31..0] - * [ 0s or unused ] [ VAL ] + * [ 0][ unused ] [ VAL ] * * Mitosis uses VAL as follows: * - * [31..24] [23..0] + * [31..28] [27..0] * [QTYPE ] [DATA ] * - * QTYPE encodes the queue type (exactly one bit set): + * QTYPE encodes the queue type: * * QTYPE = 0x1 -> Per-CPU Q - * [31 .. 24] [23 .. 16] [15 .. 0] - * [00000001] [00000000] [ CPU# ] + * [31..28] [27 .. .. 0] + * [ 0001 ] [ CPU# ] * [Q-TYPE:1] * * QTYPE = 0x2 -> Cell+L3 Q - * [31 .. 24] [23 .. 16] [15 .. 0] - * [00000010] [ CELL# ] [ L3ID ] + * [31..28] [27 .. 16] [15 .. 0] + * [ 0010 ] [ CELL# ] [ L3ID ] * [Q-TYPE:2] * */ +/* + * The use of these bitfields depends on compiler defined byte AND bit ordering. + * Make sure we're only building with Clang/LLVM and that we're little-endian. + */ +#ifndef __clang__ +#error "This code must be compiled with Clang/LLVM (eBPF: clang -target bpf)." +#endif + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +#error "dsq64 bitfield layout assumes little-endian (bpfel)." 
+#endif + +/* ---- Bitfield widths (bits) ---- */ +#define CPU_B 28 +#define L3_B 16 +#define CELL_B 12 +#define TYPE_B 4 +#define DATA_B 28 +#define RSVD_B 32 + +/* Sum checks (in bits) */ +_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); +_Static_assert(L3_B + CELL_B + TYPE_B == 32, "CELL+L3 layout low half must be 32 bits"); +_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); + +typedef union { + u64 raw; -#define DSQ_ERROR 0xFFFFFFFF; /* Error value for DSQ functions */ + /* Per-CPU user DSQ */ + struct { u64 cpu: CPU_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cpu_dsq; + + /* Cell+L3 user DSQ */ + struct { u64 l3: L3_B; u64 cell: CELL_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cell_l3_dsq; + + /* Generic user view */ + struct { u64 data: DATA_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } user_dsq; + + /* Built-in DSQ view */ + struct { u64 value:32; u64 rsvd:30; u64 local_on:1; u64 builtin:1; } builtin_dsq; + + /* NOTE: Considered packed and aligned attributes, but that's redundant */ +} dsq_id_t; + +/* + * Invalid DSQ ID Sentinel: + * invalid bc bit 63 clear (it's a user DSQ) && dsq_type == 0 (no type) + * Good for catching uninitialized DSQ IDs. +*/ +#define DSQ_INVALID ((u64) 0) + +_Static_assert(sizeof(((dsq_id_t){0}).cpu_dsq) == sizeof(u64), "cpu view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).cell_l3_dsq) == sizeof(u64), "cell+l3 view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).user_dsq) == sizeof(u64), "user common view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).builtin_dsq) == sizeof(u64), "builtin view must be 8 bytes"); + +/* Compile-time checks (in bytes) */ +_Static_assert(sizeof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8 bytes (64 bits)"); +_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8-byte aligned"); /* DSQ type enumeration */ enum dsq_type { - DSQ_UNKNOWN, + DSQ_TYPE_NONE, DSQ_TYPE_CPU, DSQ_TYPE_CELL_L3, }; -/* DSQ ID structure using unions for type-safe access */ -struct dsq_cpu { - u32 cpu : 16; - u32 unused : 8; - u32 type : 8; -} __attribute__((packed)); - -struct dsq_cell_l3 { - u32 l3 : 16; - u32 cell : 8; - u32 type : 8; -} __attribute__((packed)); - -union dsq_id { - u32 raw; - struct dsq_cpu cpu; - struct dsq_cell_l3 cell_l3; - struct { - u32 data : 24; - u32 type : 8; - } common; -} __attribute__((packed)); - -/* Static assertions to ensure correct sizes */ -/* Verify that all DSQ structures are exactly 32 bits */ -_Static_assert(sizeof(struct dsq_cpu) == 4, "dsq_cpu must be 32 bits"); -_Static_assert(sizeof(struct dsq_cell_l3) == 4, "dsq_cell_l3 must be 32 bits"); -_Static_assert(sizeof(union dsq_id) == 4, "dsq_id union must be 32 bits"); - -/* Inline helper functions for DSQ ID manipulation */ +/* Range guards */ +_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); +_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); +_Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); +_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), "DSQ_TYPE_CELL_L3 must fit in field"); + +/* + * While I considered error propagation, I decided to bail to force errors early. +*/ + +static inline bool is_user_dsq(dsq_id_t dsq_id){ + return !dsq_id.builtin_dsq.builtin && dsq_id.user_dsq.type != DSQ_TYPE_NONE; +} // Is this a per CPU DSQ? 
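+
+/*
+ * Worked encoding example (illustrative values): get_cpu_dsq_id(5) yields
+ * raw 0x0000000010000005 (cpu=5 in bits [27..0], type=DSQ_TYPE_CPU in
+ * [31..28]), and get_cell_l3_dsq_id(3, 2) yields 0x0000000020030002
+ * (l3=2 in [15..0], cell=3 in [27..16], type=DSQ_TYPE_CELL_L3 in [31..28]).
+ * Bit 63 stays clear in both, so they decode as user (non-built-in) DSQs.
+ */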
-static inline bool is_cpu_dsq(u32 dsq_id) +static inline bool is_cpu_dsq(dsq_id_t dsq_id) { - union dsq_id id = { .raw = dsq_id }; - return id.common.type == DSQ_TYPE_CPU; + return is_user_dsq(dsq_id) && dsq_id.user_dsq.type == DSQ_TYPE_CPU; } // If this is a per cpu dsq, return the cpu -static inline u32 get_cpu_from_dsq(u32 dsq_id) +static inline u32 get_cpu_from_dsq(u64 id) { - union dsq_id id = { .raw = dsq_id }; - if (id.common.type != DSQ_TYPE_CPU) - return DSQ_ERROR; - return id.cpu.cpu; + dsq_id_t dsq_id = (dsq_id_t) {.raw = id}; + if (!is_cpu_dsq(dsq_id)) + scx_bpf_error("trying to get cpu from non-cpu dsq\n"); + + return dsq_id.cpu_dsq.cpu; } /* Helper functions to construct DSQ IDs */ -static inline u32 get_cpu_dsq_id(u32 cpu) +static inline u64 get_cpu_dsq_id(u32 cpu) { + // Check for valid CPU range, 0 indexed so >=. if (cpu >= MAX_CPUS) - return DSQ_ERROR; - union dsq_id id = { .cpu = { .cpu = cpu, .unused = 0, .type = DSQ_TYPE_CPU } }; - return id.raw; + scx_bpf_error("invalid cpu %u\n", cpu); + dsq_id_t dsq_id = { .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; + + return dsq_id.raw; } -static inline u32 get_cell_l3_dsq_id(u32 cell, u32 l3) +static inline u64 get_cell_l3_dsq_id(u32 cell, u32 l3) { if (cell >= MAX_CELLS || l3 >= MAX_L3S) - return DSQ_ERROR; - union dsq_id id = { .cell_l3 = {.l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; - return id.raw; + scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); + dsq_id_t dsq_id = { .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; + + return dsq_id.raw; } diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 89c0096fd7..64c0e27e87 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -20,7 +20,7 @@ typedef _Bool bool; /* ---- Work stealing config (compile-time) ------------------------------- */ #ifndef MITOSIS_ENABLE_STEALING -#define MITOSIS_ENABLE_STEALING 0 +#define MITOSIS_ENABLE_STEALING 1 #endif /* ----------------------------------------------------------------------- */ @@ -34,7 +34,6 @@ enum consts { PCPU_BASE = 0x80000000, MAX_CG_DEPTH = 256, - }; /* Statistics */ diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index dd3f2cf240..76cacf134b 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -12,6 +12,7 @@ * cgroups belonging to the cell. */ +// TODO: fix debug printer. #include "intf.h" #include "mitosis.bpf.h" @@ -56,23 +57,13 @@ struct l3_to_cpus_map l3_to_cpus SEC(".maps"); struct function_counters_map function_counters SEC(".maps"); struct steal_stats_map steal_stats SEC(".maps"); -/* - * We store per-cpu values along with per-cell values. Helper functions to - * translate. 
- */ -static inline u32 cpu_dsq(u32 cpu) -{ - return PCPU_BASE | cpu; -} - -static inline u32 cell_dsq(u32 cell) -{ - return cell; -} +static inline void increment_counter(enum counter_idx idx) { + u64 *counter; + u32 key = idx; -static inline u32 dsq_to_cpu(u32 dsq) -{ - return dsq & ~PCPU_BASE; + counter = bpf_map_lookup_elem(&function_counters, &key); + if (counter) + (*counter)++; } static inline struct cgroup *lookup_cgrp_ancestor(struct cgroup *cgrp, @@ -127,7 +118,6 @@ static inline struct cgroup *task_cgroup(struct task_struct *p) return cgrp; } - struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); @@ -199,8 +189,12 @@ static inline int allocate_cell() if (!(c = lookup_cell(cell_idx))) return -1; - if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) + if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) { + // TODO XXX, I think we need to make this concurrent safe + __builtin_memset(c->l3_cpu_cnt, 0, sizeof(c->l3_cpu_cnt)); + c->l3_present_cnt = 0; return cell_idx; + } } scx_bpf_error("No available cells to allocate"); return -1; @@ -279,7 +273,6 @@ static inline int update_task_cpumask(struct task_struct *p, { const struct cpumask *cell_cpumask; struct cpu_ctx *cpu_ctx; - struct cell *cell; u32 cpu; if (!(cell_cpumask = lookup_cell_cpumask(tctx->cell))) @@ -288,11 +281,24 @@ static inline int update_task_cpumask(struct task_struct *p, if (!tctx->cpumask) return -EINVAL; + /* + * Calculate the intersection of CPUs that are both: + * 1. In this task's assigned cell (cell_cpumask) + * 2. Allowed by the task's CPU affinity (p->cpus_ptr) + * Store result in tctx->cpumask - this becomes the effective CPU set + * where this task can actually run. + */ bpf_cpumask_and(tctx->cpumask, cell_cpumask, p->cpus_ptr); - if (cell_cpumask) - tctx->all_cell_cpus_allowed = - bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); + /* + * Check if the task can run on ALL CPUs in its assigned cell. + * If cell_cpumask is a subset of p->cpus_ptr, it means the task's + * CPU affinity doesn't restrict it within the cell - it can use + * any CPU in the cell. This affects scheduling decisions later. + * True if all the bits in cell_cpumask are set in p->cpus_ptr. + */ + tctx->all_cell_cpus_allowed = + bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); /* * XXX - To be correct, we'd need to calculate the vtime @@ -304,16 +310,56 @@ static inline int update_task_cpumask(struct task_struct *p, * Revisit if high frequency dynamic cell switching * needs to be supported. */ + + // We want to set the task vtime to that of the cell it's joining. + // This used to be done by looking up the cell's dsq + // but now each cell has potentially multiple per l3 dsqs. 
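+	// In the cell-schedulable branch below we therefore (1) keep or pick an
+	// L3 for the task, (2) narrow its effective cpumask to affinity ∧ cell ∧
+	// L3, and (3) point tctx->dsq at the (cell, L3) DSQ and take the vtime
+	// baseline from that cell's per-L3 vtime.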
if (tctx->all_cell_cpus_allowed) { - tctx->dsq = cell_dsq(tctx->cell); - if (!(cell = lookup_cell(tctx->cell))) + + const struct cpumask *l3_mask = NULL; + if (tctx->l3 != INVALID_L3_ID) { + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + /* If the L3 no longer intersects the cell's cpumask, invalidate it */ + if (!l3_mask || !bpf_cpumask_intersects(cell_cpumask, l3_mask)) + tctx->l3 = INVALID_L3_ID; + } + + /* --- Pick a new L3 if needed --- */ + if (tctx->l3 == INVALID_L3_ID) { + s32 new_l3 = pick_l3_for_task(tctx->cell); + if (new_l3 < 0) + return -ENODEV; + tctx->l3 = new_l3; + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + if (!l3_mask) + return -ENOENT; + } + + /* --- Narrow the effective cpumask by the chosen L3 --- */ + /* tctx->cpumask already contains (task_affinity ∧ cell_mask) */ + if (tctx->cpumask) + bpf_cpumask_and(tctx->cpumask, (const struct cpumask *)tctx->cpumask, l3_mask); + + /* If empty after intersection, nothing can run here */ + if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) + return -ENODEV; + + /* --- Point to the correct (cell,L3) DSQ and set vtime baseline --- */ + tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); + + struct cell *cell = lookup_cell(tctx->cell); + if (!cell) return -ENOENT; - p->scx.dsq_vtime = READ_ONCE(cell->vtime_now); + + if (tctx->l3 < 0 || tctx->l3 >= MAX_L3S) + return -EINVAL; + p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); } else { + /* Task is CPU-restricted, use task mask */ cpu = bpf_cpumask_any_distribute(p->cpus_ptr); if (!(cpu_ctx = lookup_cpu_ctx(cpu))) return -ENOENT; - tctx->dsq = cpu_dsq(cpu); + tctx->dsq = get_cpu_dsq_id(cpu); p->scx.dsq_vtime = READ_ONCE(cpu_ctx->vtime_now); } @@ -429,20 +475,24 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, struct cpu_ctx *cctx; struct task_ctx *tctx; + increment_counter(COUNTER_SELECT_CPU); + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) return prev_cpu; if (maybe_refresh_cell(p, tctx) < 0) return prev_cpu; + /* Pinned path: only if our task really requires a per-CPU queue. 
*/ if (!tctx->all_cell_cpus_allowed) { cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); - cpu = dsq_to_cpu(tctx->dsq); + cpu = get_cpu_from_dsq(tctx->dsq); if (scx_bpf_test_and_clear_cpu_idle(cpu)) scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); return cpu; } + // Grab an idle core if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) { cstat_inc(CSTAT_LOCAL, tctx->cell, cctx); scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); @@ -476,14 +526,17 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) s32 cpu = -1; u64 basis_vtime; + increment_counter(COUNTER_ENQUEUE); + if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1))) return; if (maybe_refresh_cell(p, tctx) < 0) return; + // Cpu pinned work if (!tctx->all_cell_cpus_allowed) { - cpu = dsq_to_cpu(tctx->dsq); + cpu = get_cpu_from_dsq(tctx->dsq); } else if (!__COMPAT_is_enq_cpu_selected(enq_flags)) { /* * If we haven't selected a cpu, then we haven't looked for and kicked an @@ -507,12 +560,22 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) } if (tctx->all_cell_cpus_allowed) { + // This is a task that can run on any cpu in the cell + cstat_inc(CSTAT_CELL_DSQ, tctx->cell, cctx); - /* Task can use any CPU in its cell, so use the cell DSQ */ + + /* Task can use any CPU in its cell, set basis_vtime from per-(cell, L3) vtime */ if (!(cell = lookup_cell(tctx->cell))) return; - basis_vtime = READ_ONCE(cell->vtime_now); + + if (tctx->l3 < 0 || tctx->l3 >= MAX_L3S) { + scx_bpf_error("Invalid L3 ID for task %d in enqueue", p->pid); + return; + } + basis_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); + } else { + // This is a task that can only run on a specific cpu cstat_inc(CSTAT_CPU_DSQ, tctx->cell, cctx); /* @@ -527,7 +590,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) tctx->basis_vtime = basis_vtime; - if (time_after(vtime, basis_vtime + 8192 * slice_ns)) { + if (time_after(vtime, + basis_vtime + VTIME_MAX_FUTURE_MULTIPLIER * slice_ns)) { scx_bpf_error("vtime is too far in the future for %d", p->pid); return; } @@ -535,6 +599,7 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) * Limit the amount of budget that an idling task can accumulate * to one slice. */ + // TODO: Should this be time_before64? if (time_before(vtime, basis_vtime - slice_ns)) vtime = basis_vtime - slice_ns; @@ -550,51 +615,107 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) struct cpu_ctx *cctx; u32 cell; + increment_counter(COUNTER_DISPATCH); + if (!(cctx = lookup_cpu_ctx(-1))) return; cell = READ_ONCE(cctx->cell); - bool found = false; - u64 min_vtime_dsq; - u64 min_vtime; + /* Start from a valid DSQ */ + u64 local_dsq = get_cpu_dsq_id(cpu); + bool found = false; + u64 min_vtime_dsq = local_dsq; + u64 min_vtime = ~0ULL; /* U64_MAX */ struct task_struct *p; - bpf_for_each(scx_dsq, p, cell, 0) { - min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = cell; - found = true; - break; + + // Get L3 + u32 cpu_key = (u32)cpu; + u32 *l3_ptr = bpf_map_lookup_elem(&cpu_to_l3, &cpu_key); + s32 l3 = l3_ptr ? 
(s32)*l3_ptr : INVALID_L3_ID; + + /* Check the L3 queue */ + if (l3 != INVALID_L3_ID) { + u64 cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); + bpf_for_each(scx_dsq, p, cell_l3_dsq, 0) { + min_vtime = p->scx.dsq_vtime; + min_vtime_dsq = cell_l3_dsq; + found = true; + break; + } } - u64 dsq = cpu_dsq(cpu); - bpf_for_each(scx_dsq, p, dsq, 0) { + /* Check the CPU DSQ for a lower vtime */ + bpf_for_each(scx_dsq, p, local_dsq, 0) { if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = dsq; + min_vtime_dsq = local_dsq; found = true; } break; } /* - * If we failed to find an eligible task, scx will keep running prev if - * prev->scx.flags & SCX_TASK_QUEUED (we don't set SCX_OPS_ENQ_LAST), and - * otherwise go idle. - */ - if (!found) - return; - /* - * The move_to_local can fail if we raced with some other cpu in the cell - * and now the cell is empty. We have to ensure to try the cpu_dsq or else - * we might never wakeup. - */ - - if (!scx_bpf_dsq_move_to_local(min_vtime_dsq) && min_vtime_dsq != dsq) - scx_bpf_dsq_move_to_local(dsq); + * The move_to_local can fail if we raced with some other cpu in the cell + * and now the cell is empty. We have to ensure to try the cpu_dsq or else + * we might never wakeup. + */ + // TODO: The upstream side has "&& min_vtime_dsq != dsq" as part of this condition. + // Do we care? + if (!scx_bpf_dsq_move_to_local(min_vtime_dsq)) { +#if MITOSIS_ENABLE_STEALING + /* Dead-simple work stealing: + * If our local choices are empty, scan sibling (cell,L3) DSQs in the + * same cell and steal the head task if it can run on @cpu. + * No thresholds/cooldowns/lag heuristics—just the first eligible head. + */ + bool moved = false; + if (l3 != INVALID_L3_ID) { + // TODO: This math is kinda dumb and confusing. + u32 start = ((u32)l3 + 1) % nr_l3; + u32 off; + // TODO: This might try a bunch of L3s outside of the cell + bpf_for (off, 0, nr_l3) { + u32 cand = (start + off) % nr_l3; + if (cand == (u32)l3) + continue; + u64 src = get_cell_l3_dsq_id(cell, cand); + + struct task_struct *q; + /* Peek only at the head. */ + bpf_for_each(scx_dsq, q, src, 0) { + // TODO maybe this should use if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, q, SCX_DSQ_LOCAL, 0) == 0) + if (scx_bpf_dsq_move_to_local(src)) { + struct task_ctx *qt = lookup_task_ctx(q); + if (qt) { + qt->steal_count++; + qt->last_stolen_at = scx_bpf_now(); + /* Retag to thief L3 */ + qt->pending_l3 = l3; + } + /* Increment steal counter in map */ + u32 key = 0; + u64 *count = bpf_map_lookup_elem(&steal_stats, &key); + // NOTE: This could get expensive, but I'm not + // anticipating that many steals. Percpu if we care. 
+ if (count) + __sync_fetch_and_add(count, 1); + moved = true; + } + /* head only */ + break; + } + if (moved) + break; + } + } + if (!moved) +#endif + scx_bpf_dsq_move_to_local(local_dsq); + } } - struct cpumask_entry { unsigned long cpumask[CPUMASK_LONG_ENTRIES]; u64 used; @@ -779,8 +900,7 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) } /* - * Initialize root cell cpumask to all cpus, and then remove from it as we - * go + * Initialize root cell cpumask to all cpus, and then remove from it as we go */ bpf_cpumask_copy(root_bpf_cpumask, (const struct cpumask *)all_cpumask); @@ -960,17 +1080,38 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) goto out_root_cgrp; } + int cell_idx; + /* Recalculate L3 counts for all active cells after CPU assignment changes */ + bpf_for(cell_idx, 1, MAX_CELLS) { + struct cell *cell; + if (!(cell = lookup_cell(cell_idx))) { + scx_bpf_error("Lookup for cell %d failed in tick()", cell_idx); + goto out_root_cgrp; + } + + if (!cell->in_use) + continue; + + /* Recalculate L3 counts for each active cell */ + recalc_cell_l3_counts(cell_idx); + } + + /* Recalculate root cell's L3 counts after cpumask update */ + recalc_cell_l3_counts(ROOT_CELL_ID); + barrier(); WRITE_ONCE(applied_configuration_seq, local_configuration_seq); bpf_cgroup_release(root_cgrp_ref); return; + out_rcu_unlock: bpf_rcu_read_unlock(); out_root_cgrp: bpf_cgroup_release(root_cgrp_ref); out: - bpf_cpumask_release(root_bpf_cpumask); + if (root_bpf_cpumask) + bpf_cpumask_release(root_bpf_cpumask); } void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) @@ -984,12 +1125,44 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) return; /* - * Update both the CPU's cell and the cpu's vtime so the vtime's are - * comparable at dispatch time. + * If this task was stolen across L3s, retag to thief L3 and recompute + * effective cpumask+DSQ. Preserve vtime to keep fairness. 
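+	 * (update_task_cpumask() re-derives the DSQ and resets dsq_vtime to the
+	 * new (cell, L3) baseline, which is why the vtime is saved here and
+	 * restored after the call.)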
+ */ +#if MITOSIS_ENABLE_STEALING + if (tctx->pending_l3 >= 0 && tctx->pending_l3 < MAX_L3S) { + u64 save_v = p->scx.dsq_vtime; + tctx->l3 = tctx->pending_l3; + tctx->pending_l3 = INVALID_L3_ID; + update_task_cpumask(p, tctx); + p->scx.dsq_vtime = save_v; + } +#endif + + /* Validate task's DSQ before it starts running */ + if (tctx->dsq == DSQ_INVALID) { + if (tctx->all_cell_cpus_allowed) { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CELL-SCHEDULABLE task, can run on any CPU in cell %d)", + p->pid, tctx->cell); + } else { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CORE-PINNED task, restricted to specific CPUs)", + p->pid); + } + return; + } + + /* + * Update per-(cell, L3) vtime for cell-schedulable tasks */ - if (time_before(READ_ONCE(cell->vtime_now), p->scx.dsq_vtime)) - WRITE_ONCE(cell->vtime_now, p->scx.dsq_vtime); + if (tctx->all_cell_cpus_allowed && tctx->l3 >= 0 && tctx->l3 < MAX_L3S) { + if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), p->scx.dsq_vtime)) + WRITE_ONCE(cell->l3_vtime_now[tctx->l3], p->scx.dsq_vtime); + } + /* + * Update CPU vtime for CPU-pinned tasks + */ if (time_before(READ_ONCE(cctx->vtime_now), p->scx.dsq_vtime)) WRITE_ONCE(cctx->vtime_now, p->scx.dsq_vtime); @@ -1015,7 +1188,7 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) used = now - tctx->started_running_at; tctx->started_running_at = now; /* scale the execution time by the inverse of the weight and charge */ - p->scx.dsq_vtime += used * 100 / p->scx.weight; + p->scx.dsq_vtime += used * DEFAULT_WEIGHT_MULTIPLIER / p->scx.weight; if (cidx != 0 || tctx->all_cell_cpus_allowed) { u64 *cell_cycles = MEMBER_VPTR(cctx->cell_cycles, [cidx]); @@ -1024,6 +1197,18 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) return; } *cell_cycles += used; + + /* + * For cell-schedulable tasks, also accumulate vtime into + * per-cell per-L3 queues + */ + if (tctx->all_cell_cpus_allowed && tctx->l3 >= 0 && + tctx->l3 < MAX_L3S) { + /* Accumulate weighted execution time into per-(cell, L3) vtime */ + cell->l3_vtime_now[tctx->l3] += + used * DEFAULT_WEIGHT_MULTIPLIER / + p->scx.weight; + } } } @@ -1050,8 +1235,9 @@ s32 BPF_STRUCT_OPS(mitosis_cgroup_init, struct cgroup *cgrp, return -ENOENT; } + // Special case for root cell if (cgrp->kn->id == root_cgid) { - WRITE_ONCE(cgc->cell, 0); + WRITE_ONCE(cgc->cell, ROOT_CELL_ID); return 0; } @@ -1142,6 +1328,7 @@ s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, { struct task_ctx *tctx; struct bpf_cpumask *cpumask; + int ret; tctx = bpf_task_storage_get(&task_ctxs, p, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); @@ -1167,16 +1354,29 @@ s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, return -EINVAL; } - return update_task_cell(p, tctx, args->cgroup); + /* Initialize L3 to invalid before cell assignment */ + tctx->l3 = INVALID_L3_ID; +#if MITOSIS_ENABLE_STEALING + tctx->pending_l3 = INVALID_L3_ID; + tctx->steal_count = 0; + tctx->last_stolen_at = 0; +#endif + + // TODO clean this up + if ((ret = update_task_cell(p, tctx, args->cgroup))) { + return ret; + } + + return 0; } __hidden void dump_cpumask_word(s32 word, const struct cpumask *cpumask) { u32 u, v = 0; - bpf_for(u, 0, 32) + bpf_for(u, 0, BITS_PER_U32) { - s32 cpu = 32 * word + u; + s32 cpu = BITS_PER_U32 * word + u; if (cpu < nr_possible_cpus && bpf_cpumask_test_cpu(cpu, cpumask)) v |= 1 << u; @@ -1206,6 +1406,31 @@ static void dump_cell_cpumask(int id) dump_cpumask(cell_cpumask); } +/* Print cell state for 
debugging */ +static __always_inline void dump_cell_state(u32 cell_idx) +{ + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_dump("Cell %d: NOT FOUND", cell_idx); + return; + } + + scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", + cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); + + u32 l3; + // Print vtimes for L3s + bpf_for(l3, 0, nr_l3) { + if (cell->l3_cpu_cnt[l3] > 0) { + scx_bpf_dump(" L3[%d]: %d CPUs", l3, cell->l3_cpu_cnt[l3]); + } + } +} + +// TODO: FIX THIS +static __always_inline void dump_l3_state(){ +} + void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) { u64 dsq_id; @@ -1226,9 +1451,7 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) scx_bpf_dump("CELL[%d] CPUS=", i); dump_cell_cpumask(i); scx_bpf_dump("\n"); - scx_bpf_dump("CELL[%d] vtime=%llu nr_queued=%d\n", i, - READ_ONCE(cell->vtime_now), - scx_bpf_dsq_nr_queued(i)); + dump_cell_state(i); } bpf_for(i, 0, nr_possible_cpus) @@ -1236,11 +1459,14 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) if (!(cpu_ctx = lookup_cpu_ctx(i))) return; - dsq_id = cpu_dsq(i); + dsq_id = get_cpu_dsq_id(i); scx_bpf_dump("CPU[%d] cell=%d vtime=%llu nr_queued=%d\n", i, cpu_ctx->cell, READ_ONCE(cpu_ctx->vtime_now), scx_bpf_dsq_nr_queued(dsq_id)); } + + dump_l3_state(); + } void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, @@ -1252,7 +1478,7 @@ void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, return; scx_bpf_dump( - "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%x all_cell_cpus_allowed=%d\n", + "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%llu all_cell_cpus_allowed=%d\n", p->pid, p->scx.dsq_vtime, tctx->basis_vtime, tctx->cell, tctx->dsq, tctx->all_cell_cpus_allowed); scx_bpf_dump("Task[%d] CPUS=", p->pid); @@ -1286,7 +1512,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { if (*u8_ptr & (1 << (i % 8))) { bpf_cpumask_set_cpu(i, cpumask); - ret = scx_bpf_create_dsq(cpu_dsq(i), -1); + ret = scx_bpf_create_dsq(get_cpu_dsq_id(i), ANY_NUMA); if (ret < 0) { bpf_cpumask_release(cpumask); return ret; @@ -1301,14 +1527,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if (cpumask) bpf_cpumask_release(cpumask); + /* setup cell cpumasks */ bpf_for(i, 0, MAX_CELLS) { struct cell_cpumask_wrapper *cpumaskw; - - ret = scx_bpf_create_dsq(i, -1); - if (ret < 0) - return ret; - if (!(cpumaskw = bpf_map_lookup_elem(&cell_cpumasks, &i))) return -ENOENT; @@ -1341,11 +1563,34 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) } cells[0].in_use = true; + + /* Configure root cell (cell 0) topology at init time using nr_l3 and l3_to_cpu masks */ + recalc_cell_l3_counts(ROOT_CELL_ID); + + /* Create (cell,L3) DSQs for all pairs. Userspace will populate maps. 
*/ + // This is a crazy over-estimate + bpf_for(i, 0, MAX_CELLS) + { + u32 l3; + bpf_for(l3, 0, nr_l3) + { + u64 id = get_cell_l3_dsq_id(i, l3); + ret = scx_bpf_create_dsq(id, ANY_NUMA); + if (ret < 0) + scx_bpf_error( "Failed to create DSQ for cell %d, L3 %d: err %d", i, l3, ret); + } + } + return 0; } void BPF_STRUCT_OPS(mitosis_exit, struct scx_exit_info *ei) { + // int i; + // bpf_for(i, 0, MAX_CELLS); { + // dump_cell_state((u32)i); + // } + UEI_RECORD(uei, ei); } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index a4569f883e..4eb3b2231f 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -23,6 +23,9 @@ #define MAX_L3S 16 +#include "dsq.bpf.h" + + /* * A couple of tricky things about checking a cgroup's cpumask: * @@ -48,8 +51,24 @@ extern const volatile u32 nr_l3; enum mitosis_constants { + + /* Root cell index */ + ROOT_CELL_ID = 0, + /* Invalid/unset L3 value */ INVALID_L3_ID = -1, + + /* Default weight divisor for vtime calculation */ + DEFAULT_WEIGHT_MULTIPLIER = 100, + + /* Vtime validation multiplier (slice_ns * 8192) */ + VTIME_MAX_FUTURE_MULTIPLIER = 8192, + + /* Bits per u32 for cpumask operations */ + BITS_PER_U32 = 32, + + /* No NUMA constraint for DSQ creation */ + ANY_NUMA = -1, }; struct cell { @@ -83,12 +102,14 @@ struct task_ctx { u32 cell; /* For the sake of scheduling, a task is exclusively owned by either a cell * or a cpu */ - u32 dsq; + u64 dsq; /* latest configuration that was applied for this task */ /* (to know if it has to be re-applied) */ u32 configuration_seq; /* Is this task allowed on all cores of its cell? */ bool all_cell_cpus_allowed; + // Which L3 this task is assigned to + s32 l3; #if MITOSIS_ENABLE_STEALING /* When a task is stolen, dispatch() marks the destination L3 here. From 8523b9d1c28bc04abf7e9f64ff75c47d56314c83 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Mon, 22 Sep 2025 15:09:55 -0700 Subject: [PATCH 06/12] scx_mitosis: major work stealing cleanup --- scheds/rust/scx_mitosis/src/bpf/intf.h | 2 +- .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 156 ++++++++++++++++-- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 111 +++++-------- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 46 +++++- scheds/rust/scx_mitosis/src/main.rs | 4 +- 5 files changed, 225 insertions(+), 94 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 64c0e27e87..8957d7165c 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -46,7 +46,7 @@ enum cell_stat_idx { }; /* Function invocation counters */ -enum counter_idx { +enum fn_counter_idx { COUNTER_SELECT_CPU, COUNTER_ENQUEUE, COUNTER_DISPATCH, diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h index 0ced3fa78b..80ab1cc26b 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -3,8 +3,8 @@ * This software may be used and distributed according to the terms of the * GNU General Public License version 2. * - * This header adds L3 cache awareness to scx_mitosis by defining BPF - * maps for CPU-to-L3 domain mappings. It provides functions to + * This header assists adding L3 cache awareness to scx_mitosis by defining + * maps and fns for managing CPU-to-L3 domain mappings. 
It provides code to * recalculate per-L3 CPU counts within cells and implements weighted * random L3 selection for tasks. It also tracks work-stealing * statistics for cross-L3 task migrations. @@ -14,10 +14,15 @@ #include "mitosis.bpf.h" #include "intf.h" -// It's also an option to just compute this from the cpu_to_l3 map. -struct l3_cpu_mask { - unsigned long cpumask[CPUMASK_LONG_ENTRIES]; -}; +typedef u32 l3_id_t; +#define L3_INVALID ((l3_id_t) ~0u) + +// Configure how aggressively we steal work. +// When task is detected as a steal candidate, skip it this many times +// On a web server workload, 100 reduced steal count by ~90% +#ifdef MITOSIS_ENABLE_STEALING +#define PREVENT_N_STEALS 0 +#endif /* Work stealing statistics map - accessible from both BPF and userspace */ struct steal_stats_map { @@ -38,27 +43,46 @@ struct cpu_to_l3_map { struct l3_to_cpus_map { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, u32); - __type(value, struct l3_cpu_mask); + __type(value, struct cpumask); __uint(max_entries, MAX_L3S); }; -extern struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); -extern struct l3_to_cpus_map l3_to_cpus SEC(".maps"); -extern struct steal_stats_map steal_stats SEC(".maps"); +extern struct cpu_to_l3_map cpu_to_l3; +extern struct l3_to_cpus_map l3_to_cpus; +extern struct steal_stats_map steal_stats; + +static inline const bool l3_is_valid(u32 l3_id) { + if (l3_id == L3_INVALID) + return false; + + return (l3_id >= 0) && (l3_id < MAX_L3S); +} + +static inline void init_task_l3(struct task_ctx *tctx) { + tctx->l3 = L3_INVALID; + +#if MITOSIS_ENABLE_STEALING + tctx->pending_l3 = L3_INVALID; + tctx->steal_count = 0; + tctx->last_stolen_at = 0; + tctx->steals_prevented = 0; +#endif + +} static inline const struct cpumask *lookup_l3_cpumask(u32 l3) { - struct l3_cpu_mask *mask; + struct cpumask *mask; if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); return NULL; } - return (const struct cpumask *)mask; + return mask; } -/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes (no persistent kptrs). */ +/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes */ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) { struct cell *cell = lookup_cell(cell_idx); @@ -89,7 +113,6 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) continue; } - /* ok: dst is bpf_cpumask*, sources are (RCU cpumask*, plain cpumask*) */ bpf_cpumask_and(tmp, cell_mask, l3_mask); u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp); @@ -113,24 +136,24 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) * have higher probability of being selected. 
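 *
 * Illustrative example (counts assumed, not taken from the code): with
 * l3_cpu_cnt = {4, 2, 2} and cpu_cnt = 8, a random target of 5 walks the
 * running sum (4, then 6 > 5) and picks the second L3, so each L3 is
 * selected with probability l3_cpu_cnt[l3] / cpu_cnt.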
* * @cell_id: The cell ID to select an L3 from - * @return: L3 ID on success, INVALID_L3_ID on error, or 0 as fallback + * @return: L3 ID on success, L3_INVALID on error */ static inline s32 pick_l3_for_task(u32 cell_id) { struct cell *cell; u32 l3, target, cur = 0; - s32 ret = INVALID_L3_ID; + s32 ret = L3_INVALID; /* Look up the cell structure */ if (!(cell = lookup_cell(cell_id))) - return INVALID_L3_ID; + return L3_INVALID; /* Handle case where cell has no CPUs assigned yet */ if (!cell->cpu_cnt) { scx_bpf_error( "pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); - return INVALID_L3_ID; + return L3_INVALID; } /* Generate random target value in range [0, cpu_cnt) */ @@ -148,3 +171,100 @@ static inline s32 pick_l3_for_task(u32 cell_id) } return ret; } + +#ifdef MITOSIS_ENABLE_STEALING + +static inline bool try_stealing_this_task(struct task_ctx *task_ctx, + s32 local_l3, u64 candidate_dsq) +{ + // Attempt the steal, can fail beacuse it's a race. + if (!scx_bpf_dsq_move_to_local(candidate_dsq)) + return false; + + // We got the task! + task_ctx->steal_count++; + task_ctx->last_stolen_at = scx_bpf_now(); + /* Retag to thief L3 (the one for this cpu) */ + task_ctx->pending_l3 = local_l3; + task_ctx->steals_prevented = 0; + + /* Increment steal counter in map */ + u32 key = 0; + u64 *count = bpf_map_lookup_elem(&steal_stats, &key); + // NOTE: This could get expensive, but I'm not anticipating that many steals. Percpu if we care. + if (count) + __sync_fetch_and_add(count, 1); + + return true; +} + +/* Work stealing: + * Scan sibling (cell,L3) DSQs in the same cell and steal the first queued task if it can run on this cpu +*/ +static inline bool try_stealing_work(u32 cell, s32 local_l3) +{ + if (!l3_is_valid(local_l3)) + scx_bpf_error("try_stealing_work: invalid local_l3"); + + struct cell *cell_ptr = lookup_cell(cell); + if (!cell_ptr) + scx_bpf_error("try_stealing_work: invalid cell"); + + // Loop over all other L3s, looking for a queued task to steal + u32 i; + bpf_for(i, 1, nr_l3) + { + // Start with the next one to spread out the load + u32 candidate_l3 = (local_l3 + i) % nr_l3; + + // Prevents the optimizer from removing the following conditional return + // so that the verifier knows the read wil be safe + barrier_var(candidate_l3); + + if (candidate_l3 >= MAX_L3S) + continue; + + // Skip L3s that are not present in this cell + // Note: rechecking cell_ptr for verifier + if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) + continue; + + u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3); + + struct task_struct *task = NULL; + struct task_ctx *task_ctx; + // I'm only using this for the verifier + bool found_task = false; + + // Optimization: skip if faster than constructing an iterator + // Not redundant with later checking if task found (race) + if (scx_bpf_dsq_nr_queued(candidate_dsq)) + continue; + + // Just a trick for peeking the head element + bpf_for_each(scx_dsq, task, candidate_dsq, 0) + { + task_ctx = lookup_task_ctx(task); + found_task = (task_ctx != NULL); + break; + } + + // No task? Try next L3 + if (!found_task) + continue; + + // This knob throttles stealing. + // TODO: make runtime configurable + if (task_ctx->steals_prevented++ < PREVENT_N_STEALS) { + continue; + } + + if (!try_stealing_this_task(task_ctx, local_l3, candidate_dsq)) + continue; + + // Success, we got a task (no guarantee it was the one we peeked though... 
race) + return true; + } + return false; +} +#endif diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 76cacf134b..363a013935 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -57,7 +57,7 @@ struct l3_to_cpus_map l3_to_cpus SEC(".maps"); struct function_counters_map function_counters SEC(".maps"); struct steal_stats_map steal_stats SEC(".maps"); -static inline void increment_counter(enum counter_idx idx) { +static inline void increment_counter(enum fn_counter_idx idx) { u64 *counter; u32 key = idx; @@ -312,20 +312,18 @@ static inline int update_task_cpumask(struct task_struct *p, */ // We want to set the task vtime to that of the cell it's joining. - // This used to be done by looking up the cell's dsq - // but now each cell has potentially multiple per l3 dsqs. if (tctx->all_cell_cpus_allowed) { const struct cpumask *l3_mask = NULL; - if (tctx->l3 != INVALID_L3_ID) { + if (tctx->l3 != L3_INVALID) { l3_mask = lookup_l3_cpumask((u32)tctx->l3); /* If the L3 no longer intersects the cell's cpumask, invalidate it */ if (!l3_mask || !bpf_cpumask_intersects(cell_cpumask, l3_mask)) - tctx->l3 = INVALID_L3_ID; + tctx->l3 = L3_INVALID; } /* --- Pick a new L3 if needed --- */ - if (tctx->l3 == INVALID_L3_ID) { + if (tctx->l3 == L3_INVALID) { s32 new_l3 = pick_l3_for_task(tctx->cell); if (new_l3 < 0) return -ENODEV; @@ -351,8 +349,9 @@ static inline int update_task_cpumask(struct task_struct *p, if (!cell) return -ENOENT; - if (tctx->l3 < 0 || tctx->l3 >= MAX_L3S) + if (!l3_is_valid(tctx->l3)) return -EINVAL; + p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); } else { /* Task is CPU-restricted, use task mask */ @@ -568,7 +567,7 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) if (!(cell = lookup_cell(tctx->cell))) return; - if (tctx->l3 < 0 || tctx->l3 >= MAX_L3S) { + if (!l3_is_valid(tctx->l3)) { scx_bpf_error("Invalid L3 ID for task %d in enqueue", p->pid); return; } @@ -633,10 +632,10 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) // Get L3 u32 cpu_key = (u32)cpu; u32 *l3_ptr = bpf_map_lookup_elem(&cpu_to_l3, &cpu_key); - s32 l3 = l3_ptr ? (s32)*l3_ptr : INVALID_L3_ID; + s32 l3 = l3_ptr ? (s32)*l3_ptr : L3_INVALID; /* Check the L3 queue */ - if (l3 != INVALID_L3_ID) { + if (l3 != L3_INVALID) { u64 cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); bpf_for_each(scx_dsq, p, cell_l3_dsq, 0) { min_vtime = p->scx.dsq_vtime; @@ -661,59 +660,33 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) * and now the cell is empty. We have to ensure to try the cpu_dsq or else * we might never wakeup. */ - // TODO: The upstream side has "&& min_vtime_dsq != dsq" as part of this condition. - // Do we care? - if (!scx_bpf_dsq_move_to_local(min_vtime_dsq)) { -#if MITOSIS_ENABLE_STEALING - /* Dead-simple work stealing: - * If our local choices are empty, scan sibling (cell,L3) DSQs in the - * same cell and steal the head task if it can run on @cpu. - * No thresholds/cooldowns/lag heuristics—just the first eligible head. - */ - bool moved = false; - if (l3 != INVALID_L3_ID) { - // TODO: This math is kinda dumb and confusing. 
- u32 start = ((u32)l3 + 1) % nr_l3; - u32 off; - // TODO: This might try a bunch of L3s outside of the cell - bpf_for (off, 0, nr_l3) { - u32 cand = (start + off) % nr_l3; - if (cand == (u32)l3) - continue; - u64 src = get_cell_l3_dsq_id(cell, cand); - - struct task_struct *q; - /* Peek only at the head. */ - bpf_for_each(scx_dsq, q, src, 0) { - // TODO maybe this should use if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, q, SCX_DSQ_LOCAL, 0) == 0) - if (scx_bpf_dsq_move_to_local(src)) { - struct task_ctx *qt = lookup_task_ctx(q); - if (qt) { - qt->steal_count++; - qt->last_stolen_at = scx_bpf_now(); - /* Retag to thief L3 */ - qt->pending_l3 = l3; - } - /* Increment steal counter in map */ - u32 key = 0; - u64 *count = bpf_map_lookup_elem(&steal_stats, &key); - // NOTE: This could get expensive, but I'm not - // anticipating that many steals. Percpu if we care. - if (count) - __sync_fetch_and_add(count, 1); - moved = true; - } - /* head only */ - break; - } - if (moved) - break; - } + + + if (found) { + // We found a task in the local or cell-L3 DSQ + + // If it was in the per cpu DSQ, there is no competation, grab it and return + if (min_vtime_dsq == local_dsq) { + scx_bpf_dsq_move_to_local(min_vtime_dsq); + return; + } + + // If it was in the cell L3 DSQ, we are competing with other cpus in the cell-l3 + // try to move it to the local DSQ + if (scx_bpf_dsq_move_to_local(min_vtime_dsq)) { + // We won the race and got the task, return + return; } - if (!moved) -#endif - scx_bpf_dsq_move_to_local(local_dsq); } + +#if MITOSIS_ENABLE_STEALING + // We didn't find a task in either DSQ, or lost the race. + // Instead of going straight to idle, attempt to steal a task from another + // L3 in the cell. + + // Try stealing. If successful, this moves the task to the local runqueue + try_stealing_work(cell, l3); +#endif } struct cpumask_entry { @@ -1129,10 +1102,10 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) * effective cpumask+DSQ. Preserve vtime to keep fairness. 
*/ #if MITOSIS_ENABLE_STEALING - if (tctx->pending_l3 >= 0 && tctx->pending_l3 < MAX_L3S) { + if (l3_is_valid(tctx->pending_l3)) { u64 save_v = p->scx.dsq_vtime; tctx->l3 = tctx->pending_l3; - tctx->pending_l3 = INVALID_L3_ID; + tctx->pending_l3 = L3_INVALID; update_task_cpumask(p, tctx); p->scx.dsq_vtime = save_v; } @@ -1155,7 +1128,7 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) /* * Update per-(cell, L3) vtime for cell-schedulable tasks */ - if (tctx->all_cell_cpus_allowed && tctx->l3 >= 0 && tctx->l3 < MAX_L3S) { + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), p->scx.dsq_vtime)) WRITE_ONCE(cell->l3_vtime_now[tctx->l3], p->scx.dsq_vtime); } @@ -1202,8 +1175,7 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) * For cell-schedulable tasks, also accumulate vtime into * per-cell per-L3 queues */ - if (tctx->all_cell_cpus_allowed && tctx->l3 >= 0 && - tctx->l3 < MAX_L3S) { + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { /* Accumulate weighted execution time into per-(cell, L3) vtime */ cell->l3_vtime_now[tctx->l3] += used * DEFAULT_WEIGHT_MULTIPLIER / @@ -1355,12 +1327,7 @@ s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, } /* Initialize L3 to invalid before cell assignment */ - tctx->l3 = INVALID_L3_ID; -#if MITOSIS_ENABLE_STEALING - tctx->pending_l3 = INVALID_L3_ID; - tctx->steal_count = 0; - tctx->last_stolen_at = 0; -#endif + init_task_l3(tctx); // TODO clean this up if ((ret = update_task_cell(p, tctx, args->cgroup))) { diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index 4eb3b2231f..3f546512e8 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -56,7 +56,7 @@ enum mitosis_constants { ROOT_CELL_ID = 0, /* Invalid/unset L3 value */ - INVALID_L3_ID = -1, + // INVALID_L3_ID = -1, /* Default weight divisor for vtime calculation */ DEFAULT_WEIGHT_MULTIPLIER = 100, @@ -118,6 +118,7 @@ struct task_ctx { s32 pending_l3; u32 steal_count; /* how many times this task has been stolen */ u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ + u32 steals_prevented; /* how many times this task has been prevented from being stolen */ #endif }; @@ -125,6 +126,8 @@ struct task_ctx { static inline struct cell *lookup_cell(int idx); static inline const struct cpumask *lookup_cell_cpumask(int idx); +static inline struct task_ctx *lookup_task_ctx(struct task_struct *p); + /* MAP TYPES */ struct function_counters_map { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); @@ -132,3 +135,44 @@ struct function_counters_map { __type(value, u64); __uint(max_entries, NR_COUNTERS); }; + +// static __always_inline void task_release_cleanup(struct task_struct **pp) +// { +// if (*pp) +// bpf_task_release(*pp); +// } + +// #define SCOPED_TASK __attribute__((cleanup(task_release_cleanup))) + +// __always_inline struct task_struct * dsq_head_peek(u64 dsq_id, task_struct *p) +// { +// bpf_rcu_read_lock(); +// struct task_struct *p = NULL; +// bpf_for_each(scx_dsq, p, dsq_id, 0) { +// bpf_task_acquire(p); /* extend lifetime beyond loop */ +// break; /* only want the head */ +// } +// bpf_rcu_read_unlock(); + +// return p; +// } + +// static __always_inline struct task_struct * +// dsq_head_peek(u64 dsq_id) +// { +// struct bpf_iter_scx_dsq it = {}; +// struct task_struct *p; + +// if (bpf_iter_scx_dsq_new(&it, dsq_id, 0)) +// return NULL; + +// /* First 
element in dispatch order is the head. */ +// p = bpf_iter_scx_dsq_next(&it); + +// /* Take a ref so the pointer remains valid after we destroy the iter. */ +// if (p) +// bpf_task_acquire(p); + +// bpf_iter_scx_dsq_destroy(&it); +// return p; /* caller must bpf_task_release(p) when done */ +// } diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index c68a1476b1..9677be60c9 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ b/scheds/rust/scx_mitosis/src/main.rs @@ -560,7 +560,7 @@ impl<'a> Scheduler<'a> { let mut all_counters = Vec::new(); // Read counters for each function - for counter_idx in 0..bpf_intf::counter_idx_NR_COUNTERS { + for counter_idx in 0..bpf_intf::fn_counter_idx_NR_COUNTERS { let key = (counter_idx as u32).to_ne_bytes(); // Read per-CPU values @@ -620,7 +620,7 @@ impl<'a> Scheduler<'a> { } // Zero out all counters after printing - for counter_idx in 0..bpf_intf::counter_idx_NR_COUNTERS { + for counter_idx in 0..bpf_intf::fn_counter_idx_NR_COUNTERS { let key = (counter_idx as u32).to_ne_bytes(); let zero_value = 0u64.to_ne_bytes().to_vec(); From 7ddaba0d9646098a02d8bf178d934902f9bf35d9 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Fri, 26 Sep 2025 05:48:25 -0700 Subject: [PATCH 07/12] Use dsq_id_t type --- scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h | 13 ++--- .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 2 +- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 31 ++++++------ scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 47 ++----------------- 4 files changed, 24 insertions(+), 69 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h index a6b899d2f5..a8a8a21c2e 100644 --- a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -144,9 +144,8 @@ static inline bool is_cpu_dsq(dsq_id_t dsq_id) } // If this is a per cpu dsq, return the cpu -static inline u32 get_cpu_from_dsq(u64 id) +static inline u32 get_cpu_from_dsq(dsq_id_t dsq_id) { - dsq_id_t dsq_id = (dsq_id_t) {.raw = id}; if (!is_cpu_dsq(dsq_id)) scx_bpf_error("trying to get cpu from non-cpu dsq\n"); @@ -154,21 +153,19 @@ static inline u32 get_cpu_from_dsq(u64 id) } /* Helper functions to construct DSQ IDs */ -static inline u64 get_cpu_dsq_id(u32 cpu) +static inline dsq_id_t get_cpu_dsq_id(u32 cpu) { // Check for valid CPU range, 0 indexed so >=. 
if (cpu >= MAX_CPUS) scx_bpf_error("invalid cpu %u\n", cpu); - dsq_id_t dsq_id = { .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; - return dsq_id.raw; + return (dsq_id_t){ .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; } -static inline u64 get_cell_l3_dsq_id(u32 cell, u32 l3) +static inline dsq_id_t get_cell_l3_dsq_id(u32 cell, u32 l3) { if (cell >= MAX_CELLS || l3 >= MAX_L3S) scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); - dsq_id_t dsq_id = { .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; - return dsq_id.raw; + return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; } diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h index 80ab1cc26b..12c1a2c28c 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -229,7 +229,7 @@ static inline bool try_stealing_work(u32 cell, s32 local_l3) if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) continue; - u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3); + u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3).raw; struct task_struct *task = NULL; struct task_ctx *task_ctx; diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 363a013935..98820c122b 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -602,7 +602,7 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) if (time_before(vtime, basis_vtime - slice_ns)) vtime = basis_vtime - slice_ns; - scx_bpf_dsq_insert_vtime(p, tctx->dsq, slice_ns, vtime, enq_flags); + scx_bpf_dsq_insert_vtime(p, tctx->dsq.raw, slice_ns, vtime, enq_flags); /* Kick the CPU if needed */ if (!__COMPAT_is_enq_cpu_selected(enq_flags) && cpu >= 0) @@ -622,10 +622,10 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) cell = READ_ONCE(cctx->cell); /* Start from a valid DSQ */ - u64 local_dsq = get_cpu_dsq_id(cpu); + dsq_id_t local_dsq = get_cpu_dsq_id(cpu); bool found = false; - u64 min_vtime_dsq = local_dsq; + dsq_id_t min_vtime_dsq = local_dsq; u64 min_vtime = ~0ULL; /* U64_MAX */ struct task_struct *p; @@ -636,8 +636,8 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) /* Check the L3 queue */ if (l3 != L3_INVALID) { - u64 cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); - bpf_for_each(scx_dsq, p, cell_l3_dsq, 0) { + dsq_id_t cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); + bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) { min_vtime = p->scx.dsq_vtime; min_vtime_dsq = cell_l3_dsq; found = true; @@ -646,7 +646,7 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) } /* Check the CPU DSQ for a lower vtime */ - bpf_for_each(scx_dsq, p, local_dsq, 0) { + bpf_for_each(scx_dsq, p, local_dsq.raw, 0) { if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { min_vtime = p->scx.dsq_vtime; min_vtime_dsq = local_dsq; @@ -666,14 +666,14 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) // We found a task in the local or cell-L3 DSQ // If it was in the per cpu DSQ, there is no competation, grab it and return - if (min_vtime_dsq == local_dsq) { - scx_bpf_dsq_move_to_local(min_vtime_dsq); + if (min_vtime_dsq.raw == local_dsq.raw) { + scx_bpf_dsq_move_to_local(min_vtime_dsq.raw); return; } // If it was in the cell L3 DSQ, we are competing with other cpus in the cell-l3 // try to move it to the local DSQ 
- if (scx_bpf_dsq_move_to_local(min_vtime_dsq)) { + if (scx_bpf_dsq_move_to_local(min_vtime_dsq.raw)) { // We won the race and got the task, return return; } @@ -1112,7 +1112,7 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) #endif /* Validate task's DSQ before it starts running */ - if (tctx->dsq == DSQ_INVALID) { + if (tctx->dsq.raw == DSQ_INVALID) { if (tctx->all_cell_cpus_allowed) { scx_bpf_error( "Task %d has invalid DSQ 0 in running callback (CELL-SCHEDULABLE task, can run on any CPU in cell %d)", @@ -1400,7 +1400,7 @@ static __always_inline void dump_l3_state(){ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) { - u64 dsq_id; + dsq_id_t dsq_id; int i; struct cell *cell; struct cpu_ctx *cpu_ctx; @@ -1429,7 +1429,7 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) dsq_id = get_cpu_dsq_id(i); scx_bpf_dump("CPU[%d] cell=%d vtime=%llu nr_queued=%d\n", i, cpu_ctx->cell, READ_ONCE(cpu_ctx->vtime_now), - scx_bpf_dsq_nr_queued(dsq_id)); + scx_bpf_dsq_nr_queued(dsq_id.raw)); } dump_l3_state(); @@ -1447,7 +1447,7 @@ void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, scx_bpf_dump( "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%llu all_cell_cpus_allowed=%d\n", p->pid, p->scx.dsq_vtime, tctx->basis_vtime, tctx->cell, - tctx->dsq, tctx->all_cell_cpus_allowed); + tctx->dsq.raw, tctx->all_cell_cpus_allowed); scx_bpf_dump("Task[%d] CPUS=", p->pid); dump_cpumask(p->cpus_ptr); scx_bpf_dump("\n"); @@ -1479,7 +1479,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { if (*u8_ptr & (1 << (i % 8))) { bpf_cpumask_set_cpu(i, cpumask); - ret = scx_bpf_create_dsq(get_cpu_dsq_id(i), ANY_NUMA); + ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, ANY_NUMA); if (ret < 0) { bpf_cpumask_release(cpumask); return ret; @@ -1541,8 +1541,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) u32 l3; bpf_for(l3, 0, nr_l3) { - u64 id = get_cell_l3_dsq_id(i, l3); - ret = scx_bpf_create_dsq(id, ANY_NUMA); + ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, ANY_NUMA); if (ret < 0) scx_bpf_error( "Failed to create DSQ for cell %d, L3 %d: err %d", i, l3, ret); } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index 3f546512e8..2024d2b5a1 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -83,8 +83,8 @@ struct cell { // Number of L3s with at least one CPU in this cell u32 l3_present_cnt; - // TODO XXX remove this, only here temporarily to make the code compile - // current vtime of the cell + // TODO XXX remove this, only here temporarily to make the code compile + // current vtime of the cell u64 vtime_now; }; @@ -102,7 +102,7 @@ struct task_ctx { u32 cell; /* For the sake of scheduling, a task is exclusively owned by either a cell * or a cpu */ - u64 dsq; + dsq_id_t dsq; /* latest configuration that was applied for this task */ /* (to know if it has to be re-applied) */ u32 configuration_seq; @@ -135,44 +135,3 @@ struct function_counters_map { __type(value, u64); __uint(max_entries, NR_COUNTERS); }; - -// static __always_inline void task_release_cleanup(struct task_struct **pp) -// { -// if (*pp) -// bpf_task_release(*pp); -// } - -// #define SCOPED_TASK __attribute__((cleanup(task_release_cleanup))) - -// __always_inline struct task_struct * dsq_head_peek(u64 dsq_id, task_struct *p) -// { -// bpf_rcu_read_lock(); -// struct task_struct *p = NULL; -// bpf_for_each(scx_dsq, p, dsq_id, 0) { -// 
bpf_task_acquire(p); /* extend lifetime beyond loop */ -// break; /* only want the head */ -// } -// bpf_rcu_read_unlock(); - -// return p; -// } - -// static __always_inline struct task_struct * -// dsq_head_peek(u64 dsq_id) -// { -// struct bpf_iter_scx_dsq it = {}; -// struct task_struct *p; - -// if (bpf_iter_scx_dsq_new(&it, dsq_id, 0)) -// return NULL; - -// /* First element in dispatch order is the head. */ -// p = bpf_iter_scx_dsq_next(&it); - -// /* Take a ref so the pointer remains valid after we destroy the iter. */ -// if (p) -// bpf_task_acquire(p); - -// bpf_iter_scx_dsq_destroy(&it); -// return p; /* caller must bpf_task_release(p) when done */ -// } From 7639d21e720bf770d1903dc2f54aa946305b546b Mon Sep 17 00:00:00 2001 From: tommy-u Date: Wed, 8 Oct 2025 17:53:59 -0700 Subject: [PATCH 08/12] First cut at locking --- scheds/rust/scx_mitosis/build.rs | 2 +- scheds/rust/scx_mitosis/src/bpf/intf.h | 31 +++++ scheds/rust/scx_mitosis/src/bpf/intf_rust.h | 4 + .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 109 +++++++++++------- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 47 +++++--- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 99 ++++++++++++---- scheds/rust/scx_mitosis/src/main.rs | 80 +++++++++---- 7 files changed, 272 insertions(+), 100 deletions(-) create mode 100644 scheds/rust/scx_mitosis/src/bpf/intf_rust.h diff --git a/scheds/rust/scx_mitosis/build.rs b/scheds/rust/scx_mitosis/build.rs index f617cea07d..a5854f718c 100644 --- a/scheds/rust/scx_mitosis/build.rs +++ b/scheds/rust/scx_mitosis/build.rs @@ -6,7 +6,7 @@ fn main() { scx_cargo::BpfBuilder::new() .unwrap() - .enable_intf("src/bpf/intf.h", "bpf_intf.rs") + .enable_intf("src/bpf/intf_rust.h", "bpf_intf.rs") .enable_skel("src/bpf/mitosis.bpf.c", "bpf") .build() .unwrap(); diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 8957d7165c..00045df399 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -11,6 +11,7 @@ typedef unsigned int u32; typedef _Bool bool; #endif + #ifdef LSP #define __bpf__ #include "../../../../include/scx/ravg.bpf.h" @@ -34,6 +35,36 @@ enum consts { PCPU_BASE = 0x80000000, MAX_CG_DEPTH = 256, + + MAX_L3S = 16, +}; + +/* Kernel side sees the real lock; userspace sees padded bytes of same size/alignment */ +#if defined(__BPF__) +# define CELL_LOCK_T struct bpf_spin_lock +#else +/* userspace placeholder: kernel won’t copy spin_lock */ +# define CELL_LOCK_T struct { u32 __pad; } /* 4-byte aligned as required */ +#endif + +struct cell { + // This is a lock in the kernel and padding in the user + CELL_LOCK_T lock; + + // Whether or not the cell is used + u32 in_use; + // Number of CPUs in this cell + u32 cpu_cnt; + // per-L3 vtimes within this cell + u64 l3_vtime_now[MAX_L3S]; + // Number of CPUs from each L3 assigned to this cell + u32 l3_cpu_cnt[MAX_L3S]; + // Number of L3s with at least one CPU in this cell + u32 l3_present_cnt; + + // TODO XXX remove this, only here temporarily to make the code compile + // current vtime of the cell + u64 vtime_now; }; /* Statistics */ diff --git a/scheds/rust/scx_mitosis/src/bpf/intf_rust.h b/scheds/rust/scx_mitosis/src/bpf/intf_rust.h new file mode 100644 index 0000000000..f8ffd3252a --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/intf_rust.h @@ -0,0 +1,4 @@ +/* Force userspace path for Rust bindgen */ +#undef __BPF__ +#undef __bpf__ +#include "intf.h" diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h 
index 12c1a2c28c..7ed77d68c3 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -86,46 +86,67 @@ static inline const struct cpumask *lookup_l3_cpumask(u32 l3) static __always_inline void recalc_cell_l3_counts(u32 cell_idx) { struct cell *cell = lookup_cell(cell_idx); - if (!cell) + if (!cell) { + scx_bpf_error("recalc_cell_l3_counts: invalid cell %d", + cell_idx); return; + } - struct bpf_cpumask *tmp = bpf_cpumask_create(); - if (!tmp) + CPUMASK_GUARD(tmp_guard); + if (!tmp_guard.mask) { + scx_bpf_error( + "recalc_cell_l3_counts: failed to create tmp mask"); return; + } - u32 l3, present = 0, total_cpus = 0; + u32 l3, l3s_present = 0, total_cpus = 0; + // Just so we don't hold the lock longer than necessary + u32 l3_cpu_cnt_tmp[MAX_L3S] = {0}; - bpf_rcu_read_lock(); - const struct cpumask *cell_mask = - lookup_cell_cpumask(cell_idx); // RCU ptr - if (!cell_mask) { - bpf_rcu_read_unlock(); - bpf_cpumask_release(tmp); - return; - } + { // RCU context + RCU_READ_GUARD(); + const struct cpumask *cell_mask = + lookup_cell_cpumask(cell_idx); // RCU ptr - bpf_for(l3, 0, nr_l3) - { - const struct cpumask *l3_mask = - lookup_l3_cpumask(l3); // plain map memory - if (!l3_mask) { - cell->l3_cpu_cnt[l3] = 0; - continue; + if (!cell_mask) { + scx_bpf_error("recalc_cell_l3_counts: invalid cell mask"); + return; + } + + bpf_for(l3, 0, nr_l3) + { + const struct cpumask *l3_mask = lookup_l3_cpumask(l3); + if (!l3_mask) { + scx_bpf_error( "recalc_cell_l3_counts: invalid l3 mask"); + return; + } + + bpf_cpumask_and(tmp_guard.mask, cell_mask, l3_mask); + + u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp_guard.mask); + + l3_cpu_cnt_tmp[l3] = cnt; + + bpf_printk("recalc_cell_l3_counts: cnt %d", cnt); + + // These are counted across the whole cell + total_cpus += cnt; + + // Number of non-empty L3s in this cell + if (cnt) + l3s_present++; } + } // unlock RCU - bpf_cpumask_and(tmp, cell_mask, l3_mask); - u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp); - cell->l3_cpu_cnt[l3] = cnt; - total_cpus += cnt; - if (cnt) - present++; + bpf_spin_lock(&cell->lock); + for (u32 l3 = 0; l3 < nr_l3; l3++) { + cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; } - bpf_rcu_read_unlock(); - cell->l3_present_cnt = present; + cell->l3_present_cnt = l3s_present; cell->cpu_cnt = total_cpus; - bpf_cpumask_release(tmp); + bpf_spin_unlock(&cell->lock); } /** @@ -138,29 +159,32 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) * @cell_id: The cell ID to select an L3 from * @return: L3 ID on success, L3_INVALID on error */ +// TODO: Lock static inline s32 pick_l3_for_task(u32 cell_id) { struct cell *cell; - u32 l3, target, cur = 0; - s32 ret = L3_INVALID; /* Look up the cell structure */ - if (!(cell = lookup_cell(cell_id))) + if (!(cell = lookup_cell(cell_id))) { + scx_bpf_error("pick_l3_for_task: invalid cell %d", cell_id); return L3_INVALID; + } - /* Handle case where cell has no CPUs assigned yet */ + // No cpus if (!cell->cpu_cnt) { - scx_bpf_error( - "pick_l3_for_task: cell %d has no CPUs accounted yet", - cell_id); + scx_bpf_error( "pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); return L3_INVALID; } - /* Generate random target value in range [0, cpu_cnt) */ - target = bpf_get_prandom_u32() % cell->cpu_cnt; - /* Find the L3 domain corresponding to the target value using * weighted selection - accumulate CPU counts until we exceed target */ + + /* Generate random target value in range [0, cpu_cnt) */ + u32 target = 
bpf_get_prandom_u32() % cell->cpu_cnt; + u32 l3, cur = 0; + s32 ret = L3_INVALID; + + // This could be a prefix sum. Find first l3 where we exceed target bpf_for(l3, 0, nr_l3) { cur += cell->l3_cpu_cnt[l3]; @@ -169,6 +193,12 @@ static inline s32 pick_l3_for_task(u32 cell_id) break; } } + + if (ret == L3_INVALID) { + scx_bpf_error("pick_l3_for_task: invalid L3"); + return L3_INVALID; + } + return ret; } @@ -226,6 +256,7 @@ static inline bool try_stealing_work(u32 cell, s32 local_l3) // Skip L3s that are not present in this cell // Note: rechecking cell_ptr for verifier + // TODO: Lock? if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) continue; @@ -244,7 +275,7 @@ static inline bool try_stealing_work(u32 cell, s32 local_l3) // Just a trick for peeking the head element bpf_for_each(scx_dsq, task, candidate_dsq, 0) { - task_ctx = lookup_task_ctx(task); + task_ctx = lookup_task_ctx(task); found_task = (task_ctx != NULL); break; } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 98820c122b..b920ecaf25 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -13,7 +13,7 @@ */ // TODO: fix debug printer. -#include "intf.h" +// #include "intf.h" #include "mitosis.bpf.h" #include "dsq.bpf.h" @@ -45,9 +45,15 @@ private(root_cgrp) struct cgroup __kptr *root_cgrp; UEI_DEFINE(uei); +// Cells now defined as a map so we can lock. +struct cell_map cells SEC(".maps"); + /* * Maps used for L3-aware scheduling */ +#if 0 +struct cell_locks_map cell_locks SEC(".maps"); +#endif struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); struct l3_to_cpus_map l3_to_cpus SEC(".maps"); @@ -162,19 +168,6 @@ static inline struct cpu_ctx *lookup_cpu_ctx(int cpu) return cctx; } -struct cell cells[MAX_CELLS]; - -static inline struct cell *lookup_cell(int idx) -{ - struct cell *cell; - - cell = MEMBER_VPTR(cells, [idx]); - if (!cell) { - scx_bpf_error("Invalid cell %d", idx); - return NULL; - } - return cell; -} /* * Cells are allocated concurrently in some cases (e.g. cgroup_init). @@ -191,8 +184,11 @@ static inline int allocate_cell() if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) { // TODO XXX, I think we need to make this concurrent safe + // TODO, lock with recalc_cell...() __builtin_memset(c->l3_cpu_cnt, 0, sizeof(c->l3_cpu_cnt)); c->l3_present_cnt = 0; + // TODO zero cpu_cnt + // TODO Just zero the whole cell struct? 
return cell_idx; } } @@ -325,8 +321,10 @@ static inline int update_task_cpumask(struct task_struct *p, /* --- Pick a new L3 if needed --- */ if (tctx->l3 == L3_INVALID) { s32 new_l3 = pick_l3_for_task(tctx->cell); - if (new_l3 < 0) + if (new_l3 < 0) { + scx_bpf_error("bad L3: %d", new_l3); return -ENODEV; + } tctx->l3 = new_l3; l3_mask = lookup_l3_cpumask((u32)tctx->l3); if (!l3_mask) @@ -339,8 +337,10 @@ static inline int update_task_cpumask(struct task_struct *p, bpf_cpumask_and(tctx->cpumask, (const struct cpumask *)tctx->cpumask, l3_mask); /* If empty after intersection, nothing can run here */ - if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) + if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { + scx_bpf_error("Empty cpumask after intersection"); return -ENODEV; + } /* --- Point to the correct (cell,L3) DSQ and set vtime baseline --- */ tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); @@ -349,8 +349,10 @@ static inline int update_task_cpumask(struct task_struct *p, if (!cell) return -ENOENT; - if (!l3_is_valid(tctx->l3)) + if (!l3_is_valid(tctx->l3)){ + scx_bpf_error("Invalid L3 %d", tctx->l3); return -EINVAL; + } p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); } else { @@ -1386,7 +1388,8 @@ static __always_inline void dump_cell_state(u32 cell_idx) cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); u32 l3; - // Print vtimes for L3s + // TODO Print vtimes for L3s + // TODO lock bpf_for(l3, 0, nr_l3) { if (cell->l3_cpu_cnt[l3] > 0) { scx_bpf_dump(" L3[%d]: %d CPUs", l3, cell->l3_cpu_cnt[l3]); @@ -1490,6 +1493,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) } } + cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); if (cpumask) bpf_cpumask_release(cpumask); @@ -1529,7 +1533,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) } } - cells[0].in_use = true; + struct cell *cell = lookup_cell(0); + if (!cell) { + scx_bpf_error("Failed to lookup cell 0"); + return -ENOENT; + } + cell->in_use = true; /* Configure root cell (cell 0) topology at init time using nr_l3 and l3_to_cpu masks */ recalc_cell_l3_counts(ROOT_CELL_ID); diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index 2024d2b5a1..4441a19a27 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -20,12 +20,8 @@ #endif #include "intf.h" - -#define MAX_L3S 16 - #include "dsq.bpf.h" - /* * A couple of tricky things about checking a cgroup's cpumask: * @@ -50,6 +46,10 @@ extern const volatile u32 nr_l3; + + +extern struct cell_map cells; + enum mitosis_constants { /* Root cell index */ @@ -71,22 +71,36 @@ enum mitosis_constants { ANY_NUMA = -1, }; -struct cell { - // Whether or not the cell is used or not - u32 in_use; - // Number of CPUs in this cell - u32 cpu_cnt; - // per-L3 vtimes within this cell - u64 l3_vtime_now[MAX_L3S]; - // Number of CPUs from each L3 assigned to this cell - u32 l3_cpu_cnt[MAX_L3S]; - // Number of L3s with at least one CPU in this cell - u32 l3_present_cnt; - - // TODO XXX remove this, only here temporarily to make the code compile - // current vtime of the cell - u64 vtime_now; -}; + + + +static inline struct cell *lookup_cell(int idx) +{ + struct cell *cell; + + cell = bpf_map_lookup_elem(&cells, &idx); + + if (!cell) { + scx_bpf_error("Invalid cell %d", idx); + return NULL; + } + return cell; +} + +static inline struct bpf_spin_lock *get_cell_lock(u32 cell_idx) +{ + if (cell_idx >= MAX_CELLS) { + 
scx_bpf_error("Invalid cell index %d", cell_idx); + return NULL; + } + + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_error("Cell %d not found", cell_idx); + return NULL; + } + return &cell->lock; +} /* * task_ctx is the per-task information kept by scx_mitosis @@ -123,7 +137,6 @@ struct task_ctx { }; // These could go in mitosis.bpf.h, but we'll cross that bridge when we get -static inline struct cell *lookup_cell(int idx); static inline const struct cpumask *lookup_cell_cpumask(int idx); static inline struct task_ctx *lookup_task_ctx(struct task_struct *p); @@ -135,3 +148,47 @@ struct function_counters_map { __type(value, u64); __uint(max_entries, NR_COUNTERS); }; + +struct cell_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cell); + __uint(max_entries, MAX_CELLS); +}; + +struct rcu_read_guard { + bool active; +}; + +static inline struct rcu_read_guard rcu_read_lock_guard(void) { + bpf_rcu_read_lock(); + return (struct rcu_read_guard){.active = true}; +} + +static inline void rcu_read_guard_release(struct rcu_read_guard *guard) { + if (guard->active) { + bpf_rcu_read_unlock(); + guard->active = false; + } +} +#define RCU_READ_GUARD() \ + struct rcu_read_guard __rcu_guard __attribute__((__cleanup__(rcu_read_guard_release))) = rcu_read_lock_guard() + +struct cpumask_guard { + struct bpf_cpumask *mask; +}; + +static inline struct cpumask_guard cpumask_create_guard(void) { + struct bpf_cpumask *mask = bpf_cpumask_create(); + return (struct cpumask_guard){.mask = mask}; +} + +static inline void cpumask_guard_release(struct cpumask_guard *guard) { + if (guard->mask) { + bpf_cpumask_release(guard->mask); + guard->mask = NULL; + } +} + +#define CPUMASK_GUARD(var_name) \ + struct cpumask_guard var_name __attribute__((__cleanup__(cpumask_guard_release))) = cpumask_create_guard() diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index 9677be60c9..b25be74326 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ b/scheds/rust/scx_mitosis/src/main.rs @@ -24,7 +24,7 @@ use anyhow::Context; use anyhow::Result; use clap::Parser; use crossbeam::channel::RecvTimeoutError; -use libbpf_rs::{MapCore, OpenObject}; +use libbpf_rs::{MapCore, OpenObject, MapFlags}; use log::debug; use log::info; use log::trace; @@ -49,6 +49,12 @@ use stats::CellMetrics; use stats::Metrics; use crate::mitosis_topology_utils::{populate_topology_maps, MapKind}; +// This is the cell type from intf.h. +// When copied to user, the lock field is omitted. +// We can mmap it, or use calls to the BPF_MAP_LOOKUP_ELEM +// command of the bpf() system call with the BPF_F_LOCK flag +type BpfCell = bpf_intf::cell; + const SCHEDULER_NAME: &str = "scx_mitosis"; const MAX_CELLS: usize = bpf_intf::consts_MAX_CELLS as usize; const NR_CSTATS: usize = bpf_intf::cell_stat_idx_NR_CSTATS as usize; @@ -138,14 +144,14 @@ const QUEUE_STATS_IDX: [bpf_intf::cell_stat_idx; 3] = [ // Per cell book-keeping #[derive(Debug)] -struct Cell { +struct CellMask { cpus: Cpumask, } struct Scheduler<'a> { skel: BpfSkel<'a>, monitor_interval: Duration, - cells: HashMap, + cells: HashMap, // These are the per-cell cstats. // Note these are accumulated across all CPUs. 
prev_cell_stats: [[u64; NR_CSTATS]; MAX_CELLS], @@ -193,11 +199,36 @@ impl Display for DistributionStats { } impl<'a> Scheduler<'a> { + fn get_bpf_cell(&self, cell_id: u32) -> anyhow::Result> { + let key = cell_id.to_ne_bytes(); + let map = &self.skel.maps.cells; // NOTE: map is a field, not a method + + match map.lookup(&key, MapFlags::ANY)? { + Some(bytes) => { + let need = core::mem::size_of::(); + if bytes.len() != need { + anyhow::bail!("cells value size {} != BpfCell {}", bytes.len(), need); + } + // Copy to an aligned buffer to avoid misaligned reference + let mut tmp = MaybeUninit::::uninit(); + unsafe { + std::ptr::copy_nonoverlapping( + bytes.as_ptr(), + tmp.as_mut_ptr() as *mut u8, + need, + ); + Ok(Some(tmp.assume_init())) + } + } + None => Ok(None), + } + } + fn is_cell_in_use(&self, cell_id: u32) -> bool { - let cells = &self.skel.maps.bss_data.as_ref().unwrap().cells; - let bpf_cell = cells[cell_id as usize]; - let in_use = unsafe { std::ptr::read_volatile(&bpf_cell.in_use as *const u32) }; - in_use != 0 + match self.get_bpf_cell(cell_id) { + Ok(Some(c)) => c.in_use != 0, + _ => false, + } } fn init(opts: &Opts, open_object: &'a mut MaybeUninit) -> Result { @@ -235,6 +266,18 @@ impl<'a> Scheduler<'a> { let mut skel = scx_ops_load!(skel, mitosis, uei)?; + // Verify our version of the cell datastructure is the same size + // as the bpf one. + let cells_info = skel.maps.cells.info()?; + let usz = core::mem::size_of::() as u32; + if cells_info.info.value_size != usz { + bail!( + "cells value_size={} but Rust expects {} (BpfCell)", + cells_info.info.value_size, + usz + ); + } + // Set up CPU to L3 topology mapping using the common functionality populate_topology_maps(&mut skel, MapKind::CpuToL3, None)?; @@ -474,7 +517,7 @@ impl<'a> Scheduler<'a> { fn print_debug_status(&self) { if let Ok(flags) = DEBUG_FLAGS.lock() { let mut disabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| (!enabled).then_some(format!("{}~{}{}", ANSI_RED, flag, ANSI_RESET))).collect(); - let mut enabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| enabled.then_some(format!("{}+{}{}", ANSI_GREEN, flag, ANSI_RESET))).collect(); + let enabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| enabled.then_some(format!("{}+{}{}", ANSI_GREEN, flag, ANSI_RESET))).collect(); disabled.extend(enabled); trace!("Debug Flags: {}", if disabled.is_empty() { "none".to_string() } else { disabled.join(" ") }); // trace!("hint: sudo ./scx_mitosis cli debug ~/+"); @@ -567,7 +610,7 @@ impl<'a> Scheduler<'a> { let percpu_values = self.skel .maps .function_counters - .lookup_percpu(&key, libbpf_rs::MapFlags::ANY) + .lookup_percpu(&key, MapFlags::ANY) .context("Failed to lookup function counter")? .unwrap_or_default(); @@ -632,7 +675,7 @@ impl<'a> Scheduler<'a> { self.skel .maps .function_counters - .update_percpu(&key, &percpu_values, libbpf_rs::MapFlags::ANY) + .update_percpu(&key, &percpu_values, MapFlags::ANY) .context("Failed to reset function counter")?; } @@ -655,7 +698,7 @@ fn update_steal_metrics(&mut self) -> Result<()> { let key = 0u32.to_ne_bytes(); // Read the count; lazily initialize the slot to 0 if it doesn't exist. 
- let steal_count = match self.skel.maps.steal_stats.lookup(&key, libbpf_rs::MapFlags::ANY) { + let steal_count = match self.skel.maps.steal_stats.lookup(&key, MapFlags::ANY) { Ok(Some(data)) if data.len() >= 8 => { u64::from_ne_bytes(data[..8].try_into().unwrap()) } @@ -667,7 +710,7 @@ fn update_steal_metrics(&mut self) -> Result<()> { } Ok(None) => { let zero = 0u64.to_ne_bytes(); - if let Err(e) = self.skel.maps.steal_stats.update(&key, &zero, libbpf_rs::MapFlags::ANY) { + if let Err(e) = self.skel.maps.steal_stats.update(&key, &zero, MapFlags::ANY) { if steals_debug { debug!("Failed to initialize steal_stats map: {e}"); } @@ -736,15 +779,12 @@ fn update_steal_metrics(&mut self) -> Result<()> { // Create cells we don't have yet, drop cells that are no longer in use. // If we continue to drop cell metrics once a cell is removed, we'll need to make sure we // flush metrics for a cell before we remove it completely. - let cells = &self.skel.maps.bss_data.as_ref().unwrap().cells; for i in 0..MAX_CELLS { let cell_idx = i as u32; - let bpf_cell = cells[i]; - let in_use = unsafe { std::ptr::read_volatile(&bpf_cell.in_use as *const u32) }; - if in_use > 0 { + if self.is_cell_in_use(cell_idx) { self.cells .entry(cell_idx) - .or_insert_with(|| Cell { + .or_insert_with(|| CellMask { cpus: Cpumask::new(), }) .cpus = cell_to_cpus @@ -769,7 +809,7 @@ fn read_cpu_ctxs(skel: &BpfSkel) -> Result> { let cpu_ctxs_vec = skel .maps .cpu_ctxs - .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY) + .lookup_percpu(&0u32.to_ne_bytes(), MapFlags::ANY) .context("Failed to lookup cpu_ctx")? .unwrap(); for cpu in 0..*NR_CPUS_POSSIBLE { @@ -787,7 +827,7 @@ fn read_cpu_to_l3(skel: &BpfSkel) -> Result> { let val = skel .maps .cpu_to_l3 - .lookup(&key, libbpf_rs::MapFlags::ANY)? + .lookup(&key, MapFlags::ANY)? .map(|v| u32::from_ne_bytes(v.try_into().unwrap())) .unwrap_or(0); cpu_to_l3.push(val); @@ -806,7 +846,7 @@ fn read_l3_to_cpus(skel: &BpfSkel) -> Result> { let mask = if let Some(v) = skel .maps .l3_to_cpus - .lookup(&key, libbpf_rs::MapFlags::ANY)? + .lookup(&key, MapFlags::ANY)? { let bytes = v.as_slice(); let mut longs = [0u64; CPUMASK_LONG_ENTRIES]; From 7972846b8459eb6455d652889459101fc5ea73b3 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Thu, 9 Oct 2025 14:10:13 -0700 Subject: [PATCH 09/12] Lock cell state --- code.txt | 2382 +++++++++++++++++ scheds/rust/scx_mitosis/src/bpf/intf.h | 47 +- .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 17 +- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 25 +- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 14 +- 5 files changed, 2454 insertions(+), 31 deletions(-) create mode 100644 code.txt diff --git a/code.txt b/code.txt new file mode 100644 index 0000000000..64c3002bbe --- /dev/null +++ b/code.txt @@ -0,0 +1,2382 @@ +]633;E;for file in scheds/rust/scx_mitosis/src/bpf/*;7dc75c10-53e2-4af4-8cab-ea0159bd7502]633;C# File: scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header defines the 64-bit dispatch queue (DSQ) ID encoding + * scheme for scx_mitosis, using type fields to distinguish between + * per-CPU and cell+L3 domain queues. It includes helper functions to + * construct, validate, and parse these DSQ IDs for queue management. 
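+ * For example, get_cell_l3_dsq_id(3, 2).raw encodes to 0x20030002:
+ * QTYPE 2 in bits 31..28, cell 3 in bits 27..16, L3 2 in bits 15..0.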
+ */ +#pragma once + +#include "intf.h" +#include "mitosis.bpf.h" + +/* + * ================================ + * BPF DSQ ID Layout (64 bits wide) + * ================================ + * + * Top-level format: + * [63] [62..0] + * [ B] [ ID ] + * + * If B == 1 it is a Built-in DSQ + * ------------------------- + * [63] [62] [61 .. 32] [31..0] + * [ 1] [ L] [ R ] [ V ] + * + * - L (bit 62): LOCAL_ON flag + * If L == 1 -> V = CPU number + * - R (30 bits): reserved / unused + * - V (32 bits): value (e.g., CPU#) + * + * If B == 0 -> User-defined DSQ + * ----------------------------- + * Only the low 32 bits are used. + * + * [63 .. 32] [31..0] + * [ 0][ unused ] [ VAL ] + * + * Mitosis uses VAL as follows: + * + * [31..28] [27..0] + * [QTYPE ] [DATA ] + * + * QTYPE encodes the queue type: + * + * QTYPE = 0x1 -> Per-CPU Q + * [31..28] [27 .. .. 0] + * [ 0001 ] [ CPU# ] + * [Q-TYPE:1] + * + * QTYPE = 0x2 -> Cell+L3 Q + * [31..28] [27 .. 16] [15 .. 0] + * [ 0010 ] [ CELL# ] [ L3ID ] + * [Q-TYPE:2] + * + */ +/* + * The use of these bitfields depends on compiler defined byte AND bit ordering. + * Make sure we're only building with Clang/LLVM and that we're little-endian. + */ +#ifndef __clang__ +#error "This code must be compiled with Clang/LLVM (eBPF: clang -target bpf)." +#endif + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +#error "dsq64 bitfield layout assumes little-endian (bpfel)." +#endif + +/* ---- Bitfield widths (bits) ---- */ +#define CPU_B 28 +#define L3_B 16 +#define CELL_B 12 +#define TYPE_B 4 +#define DATA_B 28 +#define RSVD_B 32 + +/* Sum checks (in bits) */ +_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); +_Static_assert(L3_B + CELL_B + TYPE_B == 32, "CELL+L3 layout low half must be 32 bits"); +_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); + +typedef union { + u64 raw; + + /* Per-CPU user DSQ */ + struct { u64 cpu: CPU_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cpu_dsq; + + /* Cell+L3 user DSQ */ + struct { u64 l3: L3_B; u64 cell: CELL_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cell_l3_dsq; + + /* Generic user view */ + struct { u64 data: DATA_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } user_dsq; + + /* Built-in DSQ view */ + struct { u64 value:32; u64 rsvd:30; u64 local_on:1; u64 builtin:1; } builtin_dsq; + + /* NOTE: Considered packed and aligned attributes, but that's redundant */ +} dsq_id_t; + +/* + * Invalid DSQ ID Sentinel: + * invalid bc bit 63 clear (it's a user DSQ) && dsq_type == 0 (no type) + * Good for catching uninitialized DSQ IDs. 
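+ * A zero-initialized dsq_id_t therefore reads as DSQ_INVALID, which is how
+ * mitosis_running() catches tasks whose DSQ was never assigned.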
+*/ +#define DSQ_INVALID ((u64) 0) + +_Static_assert(sizeof(((dsq_id_t){0}).cpu_dsq) == sizeof(u64), "cpu view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).cell_l3_dsq) == sizeof(u64), "cell+l3 view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).user_dsq) == sizeof(u64), "user common view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){0}).builtin_dsq) == sizeof(u64), "builtin view must be 8 bytes"); + +/* Compile-time checks (in bytes) */ +_Static_assert(sizeof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8 bytes (64 bits)"); +_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8-byte aligned"); + +/* DSQ type enumeration */ +enum dsq_type { + DSQ_TYPE_NONE, + DSQ_TYPE_CPU, + DSQ_TYPE_CELL_L3, +}; + +/* Range guards */ +_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); +_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); +_Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); +_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), "DSQ_TYPE_CELL_L3 must fit in field"); + +/* + * While I considered error propagation, I decided to bail to force errors early. +*/ + +static inline bool is_user_dsq(dsq_id_t dsq_id){ + return !dsq_id.builtin_dsq.builtin && dsq_id.user_dsq.type != DSQ_TYPE_NONE; +} + +// Is this a per CPU DSQ? +static inline bool is_cpu_dsq(dsq_id_t dsq_id) +{ + return is_user_dsq(dsq_id) && dsq_id.user_dsq.type == DSQ_TYPE_CPU; +} + +// If this is a per cpu dsq, return the cpu +static inline u32 get_cpu_from_dsq(dsq_id_t dsq_id) +{ + if (!is_cpu_dsq(dsq_id)) + scx_bpf_error("trying to get cpu from non-cpu dsq\n"); + + return dsq_id.cpu_dsq.cpu; +} + +/* Helper functions to construct DSQ IDs */ +static inline dsq_id_t get_cpu_dsq_id(u32 cpu) +{ + // Check for valid CPU range, 0 indexed so >=. + if (cpu >= MAX_CPUS) + scx_bpf_error("invalid cpu %u\n", cpu); + + return (dsq_id_t){ .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; +} + +static inline dsq_id_t get_cell_l3_dsq_id(u32 cell, u32 l3) +{ + if (cell >= MAX_CELLS || l3 >= MAX_L3S) + scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); + + return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; +} +# File: scheds/rust/scx_mitosis/src/bpf/intf.h +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. 
+#ifndef __INTF_H +#define __INTF_H + +#ifndef __KERNEL__ +typedef unsigned long long u64; +typedef unsigned int u32; +typedef _Bool bool; +#endif + +#ifdef LSP +#define __bpf__ +#include "../../../../include/scx/ravg.bpf.h" +#else +#include +#endif + +/* ---- Work stealing config (compile-time) ------------------------------- */ +#ifndef MITOSIS_ENABLE_STEALING +#define MITOSIS_ENABLE_STEALING 1 +#endif +/* ----------------------------------------------------------------------- */ + +enum consts { + CACHELINE_SIZE = 64, + MAX_CPUS_SHIFT = 9, + MAX_CPUS = 1 << MAX_CPUS_SHIFT, + MAX_CPUS_U8 = MAX_CPUS / 8, + MAX_CELLS = 16, + USAGE_HALF_LIFE = 100000000, /* 100ms */ + + PCPU_BASE = 0x80000000, + MAX_CG_DEPTH = 256, +}; + +/* Statistics */ +enum cell_stat_idx { + CSTAT_LOCAL, + CSTAT_CPU_DSQ, + CSTAT_CELL_DSQ, + CSTAT_AFFN_VIOL, + NR_CSTATS, +}; + +/* Function invocation counters */ +enum fn_counter_idx { + COUNTER_SELECT_CPU, + COUNTER_ENQUEUE, + COUNTER_DISPATCH, + NR_COUNTERS, +}; + +struct cpu_ctx { + u64 cstats[MAX_CELLS][NR_CSTATS]; + u64 cell_cycles[MAX_CELLS]; + u32 cell; + u64 vtime_now; +}; + +struct cgrp_ctx { + u32 cell; + bool cell_owner; +}; + +#endif /* __INTF_H */ +# File: scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header assists adding L3 cache awareness to scx_mitosis by defining + * maps and fns for managing CPU-to-L3 domain mappings. It provides code to + * recalculate per-L3 CPU counts within cells and implements weighted + * random L3 selection for tasks. It also tracks work-stealing + * statistics for cross-L3 task migrations. + */ +#pragma once + +#include "mitosis.bpf.h" +#include "intf.h" + +typedef u32 l3_id_t; +#define L3_INVALID ((l3_id_t)~0u) + +// Configure how aggressively we steal work. 
+// When task is detected as a steal candidate, skip it this many times +// On a web server workload, 100 reduced steal count by ~90% +#ifdef MITOSIS_ENABLE_STEALING +#define PREVENT_N_STEALS 0 +#endif + +/* Work stealing statistics map - accessible from both BPF and userspace */ +struct steal_stats_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 1); +}; + +// A CPU -> L3 cache ID map +struct cpu_to_l3_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, MAX_CPUS); +}; + +struct l3_to_cpus_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cpumask); + __uint(max_entries, MAX_L3S); +}; + +extern struct cpu_to_l3_map cpu_to_l3; +extern struct l3_to_cpus_map l3_to_cpus; +extern struct steal_stats_map steal_stats; + +static inline const bool l3_is_valid(u32 l3_id) +{ + if (l3_id == L3_INVALID) + return false; + + return (l3_id >= 0) && (l3_id < MAX_L3S); +} + +static inline void init_task_l3(struct task_ctx *tctx) +{ + tctx->l3 = L3_INVALID; + +#if MITOSIS_ENABLE_STEALING + tctx->pending_l3 = L3_INVALID; + tctx->steal_count = 0; + tctx->last_stolen_at = 0; + tctx->steals_prevented = 0; +#endif +} + +static inline const struct cpumask *lookup_l3_cpumask(u32 l3) +{ + struct cpumask *mask; + + if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { + scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); + return NULL; + } + + return mask; +} + +/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes */ +// TODO: use RAII and lock around updates (races with ) +static __always_inline void recalc_cell_l3_counts(u32 cell_idx) +{ + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_error("recalc_cell_l3_counts: invalid cell %d", + cell_idx); + return; + } + + CPUMASK_GUARD(tmp_guard); + if (!tmp_guard.mask) { + scx_bpf_error( + "recalc_cell_l3_counts: failed to create tmp mask"); + return; + } + + u32 l3, l3s_present = 0, total_cpus = 0; + // Just so we don't hold the lock longer than necessary + u32 l3_cpu_cnt_tmp[MAX_L3S] = {0}; + + { // RCU context + RCU_READ_GUARD(); + const struct cpumask *cell_mask = + lookup_cell_cpumask(cell_idx); // RCU ptr + + if (!cell_mask) { + scx_bpf_error( + "recalc_cell_l3_counts: invalid cell mask"); + return; + } + + bpf_for(l3, 0, nr_l3) + { + const struct cpumask *l3_mask = lookup_l3_cpumask(l3); + if (!l3_mask) { + scx_bpf_error( + "recalc_cell_l3_counts: invalid l3 mask"); + return; + } + + bpf_cpumask_and(tmp_guard.mask, cell_mask, l3_mask); + + u32 cnt = bpf_cpumask_weight( (const struct cpumask *)tmp_guard.mask); + + l3_cpu_cnt_tmp[l3] = cnt; + + bpf_printk("recalc_cell_l3_counts: cnt %d", cnt); + + // These are counted across the whole cell + total_cpus += cnt; + + if (cnt) + l3s_present++; + } + } // bpf_rcu_read_unlock(); + + // WITH_CELL_LOCK(cell, cell_idx, { + for (u32 l3 = 0; l3 < nr_l3; l3++) { + cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; + } + + cell->l3_present_cnt = l3s_present; + cell->cpu_cnt = total_cpus; + // }); +} + +/** + * Weighted random selection of an L3 cache domain for a task. + * + * Uses the CPU count in each L3 domain within the cell as weights to + * probabilistically select an L3. L3 domains with more CPUs in the cell + * have higher probability of being selected. 
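+ * For example, with l3_cpu_cnt = {4, 2, 2} (cpu_cnt = 8) the target is drawn
+ * from [0, 8): targets 0-3 select L3 0, 4-5 select L3 1, 6-7 select L3 2.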
+ * + * @cell_id: The cell ID to select an L3 from + * @return: L3 ID on success, L3_INVALID on error + */ +// TODO: Lock +static inline s32 pick_l3_for_task(u32 cell_id) +{ + struct cell *cell; + + /* Look up the cell structure */ + if (!(cell = lookup_cell(cell_id))) { + scx_bpf_error("pick_l3_for_task: invalid cell %d", cell_id); + return L3_INVALID; + } + + // No cells + if (!cell->cpu_cnt) { + scx_bpf_error( "pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); + return L3_INVALID; + } + + /* Find the L3 domain corresponding to the target value using + * weighted selection - accumulate CPU counts until we exceed target */ + + /* Generate random target value in range [0, cpu_cnt) */ + u32 target = bpf_get_prandom_u32() % cell->cpu_cnt; + u32 l3, cur = 0; + s32 ret = L3_INVALID; + + // This could be a prefix sum. Find first l3 where we exceed target + bpf_for(l3, 0, nr_l3) + { + cur += cell->l3_cpu_cnt[l3]; + if (target < cur) { + ret = (s32)l3; + break; + } + } + + if (ret == L3_INVALID) { + scx_bpf_error("pick_l3_for_task: invalid L3"); + return L3_INVALID; + } + + return ret; +} + +#ifdef MITOSIS_ENABLE_STEALING + +static inline bool try_stealing_this_task(struct task_ctx *task_ctx, + s32 local_l3, u64 candidate_dsq) +{ + // Attempt the steal, can fail beacuse it's a race. + if (!scx_bpf_dsq_move_to_local(candidate_dsq)) + return false; + + // We got the task! + task_ctx->steal_count++; + task_ctx->last_stolen_at = scx_bpf_now(); + /* Retag to thief L3 (the one for this cpu) */ + task_ctx->pending_l3 = local_l3; + task_ctx->steals_prevented = 0; + + /* Increment steal counter in map */ + u32 key = 0; + u64 *count = bpf_map_lookup_elem(&steal_stats, &key); + // NOTE: This could get expensive, but I'm not anticipating that many steals. Percpu if we care. + if (count) + __sync_fetch_and_add(count, 1); + + return true; +} + +/* Work stealing: + * Scan sibling (cell,L3) DSQs in the same cell and steal the first queued task if it can run on this cpu +*/ +static inline bool try_stealing_work(u32 cell, s32 local_l3) +{ + if (!l3_is_valid(local_l3)) + scx_bpf_error("try_stealing_work: invalid local_l3"); + + struct cell *cell_ptr = lookup_cell(cell); + if (!cell_ptr) + scx_bpf_error("try_stealing_work: invalid cell"); + + // Loop over all other L3s, looking for a queued task to steal + u32 i; + bpf_for(i, 1, nr_l3) + { + // Start with the next one to spread out the load + u32 candidate_l3 = (local_l3 + i) % nr_l3; + + // Prevents the optimizer from removing the following conditional return + // so that the verifier knows the read wil be safe + barrier_var(candidate_l3); + + if (candidate_l3 >= MAX_L3S) + continue; + + // Skip L3s that are not present in this cell + // Note: rechecking cell_ptr for verifier + // TODO: Lock? + if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) + continue; + + u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3).raw; + + struct task_struct *task = NULL; + struct task_ctx *task_ctx; + // I'm only using this for the verifier + bool found_task = false; + + // Optimization: skip if faster than constructing an iterator + // Not redundant with later checking if task found (race) + if (scx_bpf_dsq_nr_queued(candidate_dsq)) + continue; + + // Just a trick for peeking the head element + bpf_for_each(scx_dsq, task, candidate_dsq, 0) + { + task_ctx = lookup_task_ctx(task); + found_task = (task_ctx != NULL); + break; + } + + // No task? Try next L3 + if (!found_task) + continue; + + // This knob throttles stealing. 
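+		// e.g. PREVENT_N_STEALS == 2 skips a given candidate twice before
+		// allowing a steal attempt; the default of 0 above disables throttling.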
+ // TODO: make runtime configurable + if (task_ctx->steals_prevented++ < PREVENT_N_STEALS) { + continue; + } + + if (!try_stealing_this_task(task_ctx, local_l3, candidate_dsq)) + continue; + + // Success, we got a task (no guarantee it was the one we peeked though... race) + return true; + } + return false; +} +#endif +# File: scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * scx_mitosis is a dynamic affinity scheduler. Cgroups (and their tasks) are + * assigned to Cells which are affinitized to discrete sets of CPUs. The number + * of cells is dynamic, as is cgroup to cell assignment and cell to CPU + * assignment (all are determined by userspace). + * + * Each cell has an associated DSQ which it uses for vtime scheduling of the + * cgroups belonging to the cell. + */ + +// TODO: fix debug printer. +#include "intf.h" + +#include "mitosis.bpf.h" +#include "dsq.bpf.h" +#include "l3_aware.bpf.h" + +char _license[] SEC("license") = "GPL"; + +/* + * Variables populated by userspace + */ +const volatile u32 nr_possible_cpus = 1; +const volatile bool smt_enabled = true; +const volatile unsigned char all_cpus[MAX_CPUS_U8]; + +const volatile u64 slice_ns; +const volatile u64 root_cgid = 1; + +const volatile u32 nr_l3 = 1; +/* + * CPU assignment changes aren't fully in effect until a subsequent tick() + * configuration_seq is bumped on each assignment change + * applied_configuration_seq is bumped when the effect is fully applied + */ +u32 configuration_seq; +u32 applied_configuration_seq; + +private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask; +private(root_cgrp) struct cgroup __kptr *root_cgrp; + +UEI_DEFINE(uei); + +// Cells now defined as a map so we can lock. 
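+// Userspace reads entries via bpf() map lookups (see get_bpf_cell() in main.rs)
+// rather than through the global BSS array this replaced.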
+struct cell_map cells SEC(".maps"); + +/* + * Maps used for L3-aware scheduling +*/ +#if 0 +struct cell_locks_map cell_locks SEC(".maps"); +#endif +struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); +struct l3_to_cpus_map l3_to_cpus SEC(".maps"); + +/* + * Maps for statistics +*/ +struct function_counters_map function_counters SEC(".maps"); +struct steal_stats_map steal_stats SEC(".maps"); + +static inline void increment_counter(enum fn_counter_idx idx) { + u64 *counter; + u32 key = idx; + + counter = bpf_map_lookup_elem(&function_counters, &key); + if (counter) + (*counter)++; +} + +static inline struct cgroup *lookup_cgrp_ancestor(struct cgroup *cgrp, + u32 ancestor) +{ + struct cgroup *cg; + + if (!(cg = bpf_cgroup_ancestor(cgrp, ancestor))) { + scx_bpf_error("Failed to get ancestor level %d for cgid %llu", + ancestor, cgrp->kn->id); + return NULL; + } + + return cg; +} + +struct { + __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct cgrp_ctx); +} cgrp_ctxs SEC(".maps"); + +static inline struct cgrp_ctx *lookup_cgrp_ctx_fallible(struct cgroup *cgrp) +{ + struct cgrp_ctx *cgc; + + if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, 0))) { + return NULL; + } + + return cgc; +} + +static inline struct cgrp_ctx *lookup_cgrp_ctx(struct cgroup *cgrp) +{ + struct cgrp_ctx *cgc = lookup_cgrp_ctx_fallible(cgrp); + + if (!cgc) + scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", + cgrp->kn->id); + + return cgc; +} + +static inline struct cgroup *task_cgroup(struct task_struct *p) +{ + struct cgroup *cgrp = __COMPAT_scx_bpf_task_cgroup(p); + if (!cgrp) { + scx_bpf_error("Failed to get cgroup for task %d", p->pid); + } + return cgrp; +} + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctxs SEC(".maps"); + +static inline struct task_ctx *lookup_task_ctx(struct task_struct *p) +{ + struct task_ctx *tctx; + + if ((tctx = bpf_task_storage_get(&task_ctxs, p, 0, 0))) { + return tctx; + } + + scx_bpf_error("task_ctx lookup failed"); + return NULL; +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct cpu_ctx); + __uint(max_entries, 1); +} cpu_ctxs SEC(".maps"); + +static inline struct cpu_ctx *lookup_cpu_ctx(int cpu) +{ + struct cpu_ctx *cctx; + u32 zero = 0; + + if (cpu < 0) + cctx = bpf_map_lookup_elem(&cpu_ctxs, &zero); + else + cctx = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero, cpu); + + if (!cctx) { + scx_bpf_error("no cpu_ctx for cpu %d", cpu); + return NULL; + } + + return cctx; +} + + + +/* + * Cells are allocated concurrently in some cases (e.g. cgroup_init). + * allocate_cell and free_cell enable these allocations to be done safely + */ +static inline int allocate_cell() +{ + int cell_idx; + bpf_for(cell_idx, 0, MAX_CELLS) + { + struct cell *c; + if (!(c = lookup_cell(cell_idx))) + return -1; + + if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) { + // TODO XXX, I think we need to make this concurrent safe + // TODO, lock with recalc_cell...() + __builtin_memset(c->l3_cpu_cnt, 0, sizeof(c->l3_cpu_cnt)); + c->l3_present_cnt = 0; + // TODO zero cpu_cnt + // TODO Just zero the whole cell struct? 
+ return cell_idx; + } + } + scx_bpf_error("No available cells to allocate"); + return -1; +} + +static inline int free_cell(int cell_idx) +{ + struct cell *c; + + if (cell_idx < 0 || cell_idx >= MAX_CELLS) { + scx_bpf_error("Invalid cell %d", cell_idx); + return -1; + } + + if (!(c = lookup_cell(cell_idx))) + return -1; + + WRITE_ONCE(c->in_use, 0); + return 0; +} + +/* + * Store the cpumask for each cell (owned by BPF logic). We need this in an + * explicit map to allow for these to be kptrs. + */ +struct cell_cpumask_wrapper { + struct bpf_cpumask __kptr *cpumask; + /* + * To avoid allocation on the reconfiguration path, have a second cpumask we + * can just do an xchg on. + */ + struct bpf_cpumask __kptr *tmp_cpumask; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cell_cpumask_wrapper); + __uint(max_entries, MAX_CELLS); + __uint(map_flags, 0); +} cell_cpumasks SEC(".maps"); + +static inline const struct cpumask *lookup_cell_cpumask(int idx) +{ + struct cell_cpumask_wrapper *cpumaskw; + + if (!(cpumaskw = bpf_map_lookup_elem(&cell_cpumasks, &idx))) { + scx_bpf_error("no cell cpumask"); + return NULL; + } + + return (const struct cpumask *)cpumaskw->cpumask; +} + +/* + * Helper functions for bumping per-cell stats + */ +static void cstat_add(enum cell_stat_idx idx, u32 cell, struct cpu_ctx *cctx, + s64 delta) +{ + u64 *vptr; + + if ((vptr = MEMBER_VPTR(*cctx, .cstats[cell][idx]))) + (*vptr) += delta; + else + scx_bpf_error("invalid cell or stat idxs: %d, %d", idx, cell); +} + +static void cstat_inc(enum cell_stat_idx idx, u32 cell, struct cpu_ctx *cctx) +{ + cstat_add(idx, cell, cctx, 1); +} + +static inline int update_task_cpumask(struct task_struct *p, + struct task_ctx *tctx) +{ + const struct cpumask *cell_cpumask; + struct cpu_ctx *cpu_ctx; + u32 cpu; + + if (!(cell_cpumask = lookup_cell_cpumask(tctx->cell))) + return -ENOENT; + + if (!tctx->cpumask) + return -EINVAL; + + /* + * Calculate the intersection of CPUs that are both: + * 1. In this task's assigned cell (cell_cpumask) + * 2. Allowed by the task's CPU affinity (p->cpus_ptr) + * Store result in tctx->cpumask - this becomes the effective CPU set + * where this task can actually run. + */ + bpf_cpumask_and(tctx->cpumask, cell_cpumask, p->cpus_ptr); + + /* + * Check if the task can run on ALL CPUs in its assigned cell. + * If cell_cpumask is a subset of p->cpus_ptr, it means the task's + * CPU affinity doesn't restrict it within the cell - it can use + * any CPU in the cell. This affects scheduling decisions later. + * True if all the bits in cell_cpumask are set in p->cpus_ptr. + */ + tctx->all_cell_cpus_allowed = + bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); + + /* + * XXX - To be correct, we'd need to calculate the vtime + * delta in the previous dsq, scale it by the load + * fraction difference and then offset from the new + * dsq's vtime_now. For now, just do the simple thing + * and assume the offset to be zero. + * + * Revisit if high frequency dynamic cell switching + * needs to be supported. + */ + + // We want to set the task vtime to that of the cell it's joining. 
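+	// For cell-schedulable tasks that is the per-(cell, L3) vtime just below;
+	// CPU-pinned tasks instead take the per-CPU vtime in the else branch.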
+ if (tctx->all_cell_cpus_allowed) { + + const struct cpumask *l3_mask = NULL; + if (tctx->l3 != L3_INVALID) { + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + /* If the L3 no longer intersects the cell's cpumask, invalidate it */ + if (!l3_mask || !bpf_cpumask_intersects(cell_cpumask, l3_mask)) + tctx->l3 = L3_INVALID; + } + + /* --- Pick a new L3 if needed --- */ + if (tctx->l3 == L3_INVALID) { + s32 new_l3 = pick_l3_for_task(tctx->cell); + if (new_l3 < 0) { + scx_bpf_error("bad L3: %d", new_l3); + return -ENODEV; + } + tctx->l3 = new_l3; + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + if (!l3_mask) + return -ENOENT; + } + + /* --- Narrow the effective cpumask by the chosen L3 --- */ + /* tctx->cpumask already contains (task_affinity ∧ cell_mask) */ + if (tctx->cpumask) + bpf_cpumask_and(tctx->cpumask, (const struct cpumask *)tctx->cpumask, l3_mask); + + /* If empty after intersection, nothing can run here */ + if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { + scx_bpf_error("Empty cpumask after intersection"); + return -ENODEV; + } + + /* --- Point to the correct (cell,L3) DSQ and set vtime baseline --- */ + tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); + + struct cell *cell = lookup_cell(tctx->cell); + if (!cell) + return -ENOENT; + + if (!l3_is_valid(tctx->l3)){ + scx_bpf_error("Invalid L3 %d", tctx->l3); + return -EINVAL; + } + + p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); + } else { + /* Task is CPU-restricted, use task mask */ + cpu = bpf_cpumask_any_distribute(p->cpus_ptr); + if (!(cpu_ctx = lookup_cpu_ctx(cpu))) + return -ENOENT; + tctx->dsq = get_cpu_dsq_id(cpu); + p->scx.dsq_vtime = READ_ONCE(cpu_ctx->vtime_now); + } + + return 0; +} + +/* + * Figure out the task's cell, dsq and store the corresponding cpumask in the + * task_ctx. + */ +static inline int update_task_cell(struct task_struct *p, struct task_ctx *tctx, + struct cgroup *cg) +{ + struct cgrp_ctx *cgc; + + if (!(cgc = lookup_cgrp_ctx(cg))) + return -ENOENT; + + /* + * This ordering is pretty important, we read applied_configuration_seq + * before reading everything else expecting that the updater will update + * everything and then bump applied_configuration_seq last. This ensures + * that we cannot miss an update. + */ + tctx->configuration_seq = READ_ONCE(applied_configuration_seq); + barrier(); + tctx->cell = cgc->cell; + + return update_task_cpumask(p, tctx); +} + +/* Helper function for picking an idle cpu out of a candidate set */ +static s32 pick_idle_cpu_from(struct task_struct *p, + const struct cpumask *cand_cpumask, s32 prev_cpu, + const struct cpumask *idle_smtmask) +{ + bool prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask); + s32 cpu; + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. 
+ */ + if (smt_enabled) { + if (prev_in_cand && + bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) + return prev_cpu; + + cpu = scx_bpf_pick_idle_cpu(cand_cpumask, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + return cpu; + } + + if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) + return prev_cpu; + + return scx_bpf_pick_idle_cpu(cand_cpumask, 0); +} + +/* Check if we need to update the cell/cpumask mapping */ +static __always_inline int maybe_refresh_cell(struct task_struct *p, + struct task_ctx *tctx) +{ + struct cgroup *cgrp; + int ret = 0; + if (tctx->configuration_seq != READ_ONCE(applied_configuration_seq)) { + if (!(cgrp = task_cgroup(p))) + return -1; + if (update_task_cell(p, tctx, cgrp)) + ret = -1; + bpf_cgroup_release(cgrp); + } + return ret; +} + +static __always_inline s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, + struct cpu_ctx *cctx, + struct task_ctx *tctx) +{ + struct cpumask *task_cpumask; + const struct cpumask *idle_smtmask; + s32 cpu; + + if (!(task_cpumask = (struct cpumask *)tctx->cpumask) || + !(idle_smtmask = scx_bpf_get_idle_smtmask())) { + scx_bpf_error("Failed to get task cpumask or idle smtmask"); + return -1; + } + + /* No overlap between cell cpus and task cpus, just find some idle cpu */ + if (bpf_cpumask_empty(task_cpumask)) { + cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); + cpu = pick_idle_cpu_from(p, p->cpus_ptr, prev_cpu, + idle_smtmask); + goto out; + } + + cpu = pick_idle_cpu_from(p, task_cpumask, prev_cpu, idle_smtmask); +out: + scx_bpf_put_idle_cpumask(idle_smtmask); + return cpu; +} + +/* + * select_cpu is where we update each task's cell assignment and then try to + * dispatch to an idle core in the cell if possible + */ +s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) +{ + s32 cpu; + struct cpu_ctx *cctx; + struct task_ctx *tctx; + + increment_counter(COUNTER_SELECT_CPU); + + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) + return prev_cpu; + + if (maybe_refresh_cell(p, tctx) < 0) + return prev_cpu; + + /* Pinned path: only if our task really requires a per-CPU queue. */ + if (!tctx->all_cell_cpus_allowed) { + cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); + cpu = get_cpu_from_dsq(tctx->dsq); + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); + return cpu; + } + + // Grab an idle core + if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) { + cstat_inc(CSTAT_LOCAL, tctx->cell, cctx); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); + return cpu; + } + + if (!tctx->cpumask) { + scx_bpf_error("tctx->cpumask should never be NULL"); + return prev_cpu; + } + /* + * All else failed, send it to the prev cpu (if that's valid), otherwise any + * valid cpu. 
+ */ + if (!bpf_cpumask_test_cpu(prev_cpu, cast_mask(tctx->cpumask)) && + tctx->cpumask) + cpu = bpf_cpumask_any_distribute(cast_mask(tctx->cpumask)); + else + cpu = prev_cpu; + + return cpu; +} + +void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + struct cell *cell; + s32 task_cpu = scx_bpf_task_cpu(p); + u64 vtime = p->scx.dsq_vtime; + s32 cpu = -1; + u64 basis_vtime; + + increment_counter(COUNTER_ENQUEUE); + + if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1))) + return; + + if (maybe_refresh_cell(p, tctx) < 0) + return; + + // Cpu pinned work + if (!tctx->all_cell_cpus_allowed) { + cpu = get_cpu_from_dsq(tctx->dsq); + } else if (!__COMPAT_is_enq_cpu_selected(enq_flags)) { + /* + * If we haven't selected a cpu, then we haven't looked for and kicked an + * idle CPU. Let's do the lookup now and kick at the end. + */ + if (!(cctx = lookup_cpu_ctx(-1))) + return; + cpu = pick_idle_cpu(p, task_cpu, cctx, tctx); + if (cpu == -1) + return; + if (cpu == -EBUSY) { + /* + * Verifier gets unhappy claiming two different pointer types for + * the same instruction here. This fixes it + */ + barrier_var(tctx); + if (tctx->cpumask) + cpu = bpf_cpumask_any_distribute( + (const struct cpumask *)tctx->cpumask); + } + } + + if (tctx->all_cell_cpus_allowed) { + // This is a task that can run on any cpu in the cell + + cstat_inc(CSTAT_CELL_DSQ, tctx->cell, cctx); + + /* Task can use any CPU in its cell, set basis_vtime from per-(cell, L3) vtime */ + if (!(cell = lookup_cell(tctx->cell))) + return; + + if (!l3_is_valid(tctx->l3)) { + scx_bpf_error("Invalid L3 ID for task %d in enqueue", p->pid); + return; + } + basis_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); + + } else { + // This is a task that can only run on a specific cpu + cstat_inc(CSTAT_CPU_DSQ, tctx->cell, cctx); + + /* + * cctx is the local core cpu (where enqueue is running), not the core + * the task belongs to. Fetch the right cctx + */ + if (!(cctx = lookup_cpu_ctx(cpu))) + return; + /* Task is pinned to specific CPUs, use per-CPU DSQ */ + basis_vtime = READ_ONCE(cctx->vtime_now); + } + + tctx->basis_vtime = basis_vtime; + + if (time_after(vtime, + basis_vtime + VTIME_MAX_FUTURE_MULTIPLIER * slice_ns)) { + scx_bpf_error("vtime is too far in the future for %d", p->pid); + return; + } + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + // TODO: Should this be time_before64? + if (time_before(vtime, basis_vtime - slice_ns)) + vtime = basis_vtime - slice_ns; + + scx_bpf_dsq_insert_vtime(p, tctx->dsq.raw, slice_ns, vtime, enq_flags); + + /* Kick the CPU if needed */ + if (!__COMPAT_is_enq_cpu_selected(enq_flags) && cpu >= 0) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); +} + +void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) +{ + struct cpu_ctx *cctx; + u32 cell; + + increment_counter(COUNTER_DISPATCH); + + if (!(cctx = lookup_cpu_ctx(-1))) + return; + + cell = READ_ONCE(cctx->cell); + + /* Start from a valid DSQ */ + dsq_id_t local_dsq = get_cpu_dsq_id(cpu); + + bool found = false; + dsq_id_t min_vtime_dsq = local_dsq; + u64 min_vtime = ~0ULL; /* U64_MAX */ + struct task_struct *p; + + // Get L3 + u32 cpu_key = (u32)cpu; + u32 *l3_ptr = bpf_map_lookup_elem(&cpu_to_l3, &cpu_key); + s32 l3 = l3_ptr ? 
(s32)*l3_ptr : L3_INVALID; + + /* Check the L3 queue */ + if (l3 != L3_INVALID) { + dsq_id_t cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); + bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) { + min_vtime = p->scx.dsq_vtime; + min_vtime_dsq = cell_l3_dsq; + found = true; + break; + } + } + + /* Check the CPU DSQ for a lower vtime */ + bpf_for_each(scx_dsq, p, local_dsq.raw, 0) { + if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { + min_vtime = p->scx.dsq_vtime; + min_vtime_dsq = local_dsq; + found = true; + } + break; + } + + /* + * The move_to_local can fail if we raced with some other cpu in the cell + * and now the cell is empty. We have to ensure to try the cpu_dsq or else + * we might never wakeup. + */ + + + if (found) { + // We found a task in the local or cell-L3 DSQ + + // If it was in the per cpu DSQ, there is no competation, grab it and return + if (min_vtime_dsq.raw == local_dsq.raw) { + scx_bpf_dsq_move_to_local(min_vtime_dsq.raw); + return; + } + + // If it was in the cell L3 DSQ, we are competing with other cpus in the cell-l3 + // try to move it to the local DSQ + if (scx_bpf_dsq_move_to_local(min_vtime_dsq.raw)) { + // We won the race and got the task, return + return; + } + } + +#if MITOSIS_ENABLE_STEALING + // We didn't find a task in either DSQ, or lost the race. + // Instead of going straight to idle, attempt to steal a task from another + // L3 in the cell. + + // Try stealing. If successful, this moves the task to the local runqueue + try_stealing_work(cell, l3); +#endif +} + +struct cpumask_entry { + unsigned long cpumask[CPUMASK_LONG_ENTRIES]; + u64 used; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct cpumask_entry); + __uint(max_entries, MAX_CPUMASK_ENTRIES); +} cgrp_init_percpu_cpumask SEC(".maps"); + +static inline struct cpumask_entry *allocate_cpumask_entry() +{ + int cpumask_idx; + bpf_for(cpumask_idx, 0, MAX_CPUMASK_ENTRIES) + { + struct cpumask_entry *ent = bpf_map_lookup_elem( + &cgrp_init_percpu_cpumask, &cpumask_idx); + if (!ent) { + scx_bpf_error("Failed to fetch cpumask_entry"); + return NULL; + } + if (__sync_bool_compare_and_swap(&ent->used, 0, 1)) + return ent; + } + scx_bpf_error("All cpumask entries are in use"); + return NULL; +} + +static inline void free_cpumask_entry(struct cpumask_entry *entry) +{ + WRITE_ONCE(entry->used, 0); +} + +/* For use by cleanup attribute */ +static inline void __free_cpumask_entry(struct cpumask_entry **entry) +{ + if (entry) + if (*entry) + free_cpumask_entry(*entry); +} + +#define DECLARE_CPUMASK_ENTRY(var) \ + struct cpumask_entry *var __attribute__((cleanup(__free_cpumask_entry))) + +/* Define types for cpumasks in-situ vs as a ptr in struct cpuset */ +struct cpumask___local {}; + +typedef struct cpumask___local *cpumask_var_t___ptr; + +struct cpuset___cpumask_ptr { + cpumask_var_t___ptr cpus_allowed; +}; + +typedef struct cpumask___local cpumask_var_t___arr[1]; + +struct cpuset___cpumask_arr { + cpumask_var_t___arr cpus_allowed; +}; + +/* + * Given a cgroup, get its cpumask (populated in entry), returns 0 if no + * cpumask, < 0 on error and > 0 on a populated cpumask. 
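+ * (0 is also returned when the cpuset covers every CPU in all_cpumask,
+ * i.e. it does not actually restrict the cgroup.)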
+ */ +static inline int get_cgroup_cpumask(struct cgroup *cgrp, + struct cpumask_entry *entry) +{ + if (!cgrp->subsys[cpuset_cgrp_id]) + return 0; + + struct cpuset *cpuset = + container_of(cgrp->subsys[cpuset_cgrp_id], struct cpuset, css); + + if (!cpuset) + return 0; + + unsigned long runtime_cpumask_size = bpf_core_type_size(struct cpumask); + if (runtime_cpumask_size > CPUMASK_SIZE) { + scx_bpf_error( + "Definition of struct cpumask is too large. Please increase CPUMASK_LONG_ENTRIES"); + return -EINVAL; + } + + int err; + if (bpf_core_type_matches(struct cpuset___cpumask_arr)) { + struct cpuset___cpumask_arr *cpuset_typed = + (void *)bpf_core_cast(cpuset, struct cpuset); + err = bpf_core_read(&entry->cpumask, runtime_cpumask_size, + &cpuset_typed->cpus_allowed); + } else if (bpf_core_type_matches(struct cpuset___cpumask_ptr)) { + struct cpuset___cpumask_ptr *cpuset_typed = + (void *)bpf_core_cast(cpuset, struct cpuset); + err = bpf_core_read(&entry->cpumask, runtime_cpumask_size, + cpuset_typed->cpus_allowed); + } else { + scx_bpf_error( + "Definition of struct cpuset did not match any expected struct"); + return -EINVAL; + } + + if (err < 0) { + scx_bpf_error( + "bpf_core_read of cpuset->cpus_allowed failed for cgid %llu", + cgrp->kn->id); + return err; + } + + if (bpf_cpumask_empty((const struct cpumask *)&entry->cpumask)) + return 0; + + if (!all_cpumask) { + scx_bpf_error("all_cpumask should not be NULL"); + return -EINVAL; + } + + if (bpf_cpumask_subset((const struct cpumask *)all_cpumask, + (const struct cpumask *)&entry->cpumask)) + return 0; + + return 1; +} + +/* + * This array keeps track of the cgroup ancestor's cell as we iterate over the + * cgroup hierarchy. + */ +u32 level_cells[MAX_CG_DEPTH]; +int running; + +/* The guard is a stack variable. When it falls out of scope, + * we drop the running lock. 
*/ +static inline void __running_unlock(int *guard) { + (void)guard; /* unused */ + WRITE_ONCE(running, 0); +} + +/* + * On tick, we identify new cells and apply CPU assignment + */ +void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) +{ + + u32 local_configuration_seq = READ_ONCE(configuration_seq); + if (local_configuration_seq == READ_ONCE(applied_configuration_seq)) + return; + + int zero = 0; + if (!__atomic_compare_exchange_n(&running, &zero, 1, false, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) + return; + + int __attribute__((cleanup(__running_unlock), unused)) __running_guard; + + DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); + if (!entry) + return; + + /* Get the root cell (cell 0) and its cpumask */ + struct cell_cpumask_wrapper *root_cell_cpumaskw; + if (!(root_cell_cpumaskw = + bpf_map_lookup_elem(&cell_cpumasks, &zero))) { + scx_bpf_error("Failed to find root cell cpumask"); + return; + } + + struct bpf_cpumask *root_bpf_cpumask; + root_bpf_cpumask = + bpf_kptr_xchg(&root_cell_cpumaskw->tmp_cpumask, NULL); + if (!root_bpf_cpumask) { + scx_bpf_error("tmp_cpumask should never be null"); + return; + } + if (!root_cell_cpumaskw->cpumask) { + scx_bpf_error("root cpumasks should never be null"); + goto out; + } + + if (!all_cpumask) { + scx_bpf_error("NULL all_cpumask"); + goto out; + } + + /* + * Initialize root cell cpumask to all cpus, and then remove from it as we go + */ + bpf_cpumask_copy(root_bpf_cpumask, (const struct cpumask *)all_cpumask); + + struct cgroup_subsys_state *root_css, *pos; + struct cgroup *cur_cgrp, *root_cgrp_ref; + + if (!root_cgrp) { + scx_bpf_error("root_cgrp should not be null"); + goto out; + } + + struct cgrp_ctx *root_cgrp_ctx; + if (!(root_cgrp_ctx = lookup_cgrp_ctx(root_cgrp))) + goto out; + + if (!root_cgrp) { + scx_bpf_error("root_cgrp should not be null"); + goto out; + } + + if (!(root_cgrp_ref = bpf_cgroup_acquire(root_cgrp))) { + scx_bpf_error("Failed to acquire reference to root_cgrp"); + goto out; + } + root_css = &root_cgrp_ref->self; + + bpf_rcu_read_lock(); + /* + * Iterate over all cgroups, check if any have a cpumask and populate them + * as a separate cell. + */ + bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) { + cur_cgrp = pos->cgroup; + + /* + * We can iterate over dying cgroups, in which case this lookup will + * fail. These cgroups can't have tasks in them so just continue. + */ + struct cgrp_ctx *cgrp_ctx; + if (!(cgrp_ctx = lookup_cgrp_ctx_fallible(cur_cgrp))) + continue; + + int rc = get_cgroup_cpumask(cur_cgrp, entry); + if (!rc) { + /* + * TODO: If this was a cell owner that just had its cpuset removed, + * it should free the cell. Doing so would require draining + * in-flight tasks scheduled to the dsq. + */ + /* No cpuset, assign to parent cell and continue */ + if (cur_cgrp->kn->id != root_cgid) { + u32 level = cur_cgrp->level; + if (level <= 0 || level >= MAX_CG_DEPTH) { + scx_bpf_error( + "Cgroup hierarchy is too deep: %d", + level); + goto out_rcu_unlock; + } + /* + * This is a janky way of getting the parent cell, ideally we'd + * lookup the parent cgrp_ctx and get it that way, but some + * cgroup lookups don't work here because they are (erroneously) + * only operating on the cgroup namespace of current. Given this + * is a tick() it could be anything. See + * https://lore.kernel.org/bpf/20250811175045.1055202-1-memxor@gmail.com/ + * for details. + * + * Instead, we just track the parent cells as we walk the cgroup + * hierarchy in a separate array. 
Because the iteration is + * pre-order traversal, we're guaranteed to have the current + * cgroup's ancestor's cells in level_cells. + */ + u32 parent_cell = level_cells[level - 1]; + WRITE_ONCE(cgrp_ctx->cell, parent_cell); + level_cells[level] = parent_cell; + } + continue; + } else if (rc < 0) + goto out_rcu_unlock; + + /* + * cgroup has a cpumask, allocate a new cell if needed, and assign cpus + */ + int cell_idx = READ_ONCE(cgrp_ctx->cell); + if (!cgrp_ctx->cell_owner) { + cell_idx = allocate_cell(); + if (cell_idx < 0) + goto out_rcu_unlock; + cgrp_ctx->cell_owner = true; + } + + struct cell_cpumask_wrapper *cell_cpumaskw; + if (!(cell_cpumaskw = + bpf_map_lookup_elem(&cell_cpumasks, &cell_idx))) { + scx_bpf_error("Failed to find cell cpumask: %d", + cell_idx); + goto out_rcu_unlock; + } + + struct bpf_cpumask *bpf_cpumask; + bpf_cpumask = bpf_kptr_xchg(&cell_cpumaskw->tmp_cpumask, NULL); + if (!bpf_cpumask) { + scx_bpf_error("tmp_cpumask should never be null"); + goto out_rcu_unlock; + } + bpf_cpumask_copy(bpf_cpumask, + (const struct cpumask *)&entry->cpumask); + int cpu_idx; + bpf_for(cpu_idx, 0, nr_possible_cpus) + { + if (bpf_cpumask_test_cpu( + cpu_idx, + (const struct cpumask *)&entry->cpumask)) { + struct cpu_ctx *cpu_ctx; + if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) { + bpf_cpumask_release(bpf_cpumask); + goto out_rcu_unlock; + } + cpu_ctx->cell = cell_idx; + bpf_cpumask_clear_cpu(cpu_idx, + root_bpf_cpumask); + } + } + bpf_cpumask = + bpf_kptr_xchg(&cell_cpumaskw->cpumask, bpf_cpumask); + if (!bpf_cpumask) { + scx_bpf_error("cpumask should never be null"); + goto out_rcu_unlock; + } + + bpf_cpumask = + bpf_kptr_xchg(&cell_cpumaskw->tmp_cpumask, bpf_cpumask); + if (bpf_cpumask) { + scx_bpf_error("tmp_cpumask should be null"); + bpf_cpumask_release(bpf_cpumask); + goto out_rcu_unlock; + } + + barrier(); + WRITE_ONCE(cgrp_ctx->cell, cell_idx); + u32 level = cur_cgrp->level; + if (level <= 0 || level >= MAX_CG_DEPTH) { + scx_bpf_error("Cgroup hierarchy is too deep: %d", + level); + goto out_rcu_unlock; + } + level_cells[level] = cell_idx; + } + bpf_rcu_read_unlock(); + + /* + * assign root cell cpus that are left over + */ + int cpu_idx; + bpf_for(cpu_idx, 0, nr_possible_cpus) + { + if (bpf_cpumask_test_cpu( cpu_idx, (const struct cpumask *)root_bpf_cpumask)) { + struct cpu_ctx *cpu_ctx; + if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) + goto out_root_cgrp; + cpu_ctx->cell = 0; + } + } + + root_bpf_cpumask = + bpf_kptr_xchg(&root_cell_cpumaskw->cpumask, root_bpf_cpumask); + if (!root_bpf_cpumask) { + scx_bpf_error("root cpumask should never be null"); + bpf_cgroup_release(root_cgrp_ref); + return; + } + + root_bpf_cpumask = bpf_kptr_xchg(&root_cell_cpumaskw->tmp_cpumask, + root_bpf_cpumask); + if (root_bpf_cpumask) { + scx_bpf_error("root tmp_cpumask should be null"); + goto out_root_cgrp; + } + + int cell_idx; + /* Recalculate L3 counts for all active cells after CPU assignment changes */ + bpf_for(cell_idx, 1, MAX_CELLS) { + struct cell *cell; + if (!(cell = lookup_cell(cell_idx))) { + scx_bpf_error("Lookup for cell %d failed in tick()", cell_idx); + goto out_root_cgrp; + } + + if (!cell->in_use) + continue; + + /* Recalculate L3 counts for each active cell */ + recalc_cell_l3_counts(cell_idx); + } + + /* Recalculate root cell's L3 counts after cpumask update */ + recalc_cell_l3_counts(ROOT_CELL_ID); + + barrier(); + WRITE_ONCE(applied_configuration_seq, local_configuration_seq); + + bpf_cgroup_release(root_cgrp_ref); + return; + +out_rcu_unlock: + bpf_rcu_read_unlock(); 
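+	/* fall through: the root cgroup reference is released below */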
+out_root_cgrp: + bpf_cgroup_release(root_cgrp_ref); +out: + if (root_bpf_cpumask) + bpf_cpumask_release(root_bpf_cpumask); +} + +void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + struct cell *cell; + + if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1)) || + !(cell = lookup_cell(cctx->cell))) + return; + + /* + * If this task was stolen across L3s, retag to thief L3 and recompute + * effective cpumask+DSQ. Preserve vtime to keep fairness. + */ +#if MITOSIS_ENABLE_STEALING + if (l3_is_valid(tctx->pending_l3)) { + u64 save_v = p->scx.dsq_vtime; + tctx->l3 = tctx->pending_l3; + tctx->pending_l3 = L3_INVALID; + update_task_cpumask(p, tctx); + p->scx.dsq_vtime = save_v; + } +#endif + + /* Validate task's DSQ before it starts running */ + if (tctx->dsq.raw == DSQ_INVALID) { + if (tctx->all_cell_cpus_allowed) { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CELL-SCHEDULABLE task, can run on any CPU in cell %d)", + p->pid, tctx->cell); + } else { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CORE-PINNED task, restricted to specific CPUs)", + p->pid); + } + return; + } + + /* + * Update per-(cell, L3) vtime for cell-schedulable tasks + */ + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { + if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), p->scx.dsq_vtime)) + WRITE_ONCE(cell->l3_vtime_now[tctx->l3], p->scx.dsq_vtime); + } + + /* + * Update CPU vtime for CPU-pinned tasks + */ + if (time_before(READ_ONCE(cctx->vtime_now), p->scx.dsq_vtime)) + WRITE_ONCE(cctx->vtime_now, p->scx.dsq_vtime); + + tctx->started_running_at = scx_bpf_now(); +} + +void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + struct cell *cell; + u64 now, used; + u32 cidx; + + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) + return; + + cidx = tctx->cell; + if (!(cell = lookup_cell(cidx))) + return; + + now = scx_bpf_now(); + used = now - tctx->started_running_at; + tctx->started_running_at = now; + /* scale the execution time by the inverse of the weight and charge */ + p->scx.dsq_vtime += used * DEFAULT_WEIGHT_MULTIPLIER / p->scx.weight; + + if (cidx != 0 || tctx->all_cell_cpus_allowed) { + u64 *cell_cycles = MEMBER_VPTR(cctx->cell_cycles, [cidx]); + if (!cell_cycles) { + scx_bpf_error("Cell index is too large: %d", cidx); + return; + } + *cell_cycles += used; + + /* + * For cell-schedulable tasks, also accumulate vtime into + * per-cell per-L3 queues + */ + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { + /* Accumulate weighted execution time into per-(cell, L3) vtime */ + cell->l3_vtime_now[tctx->l3] += + used * DEFAULT_WEIGHT_MULTIPLIER / + p->scx.weight; + } + } +} + +SEC("fentry/cpuset_write_resmask") +int BPF_PROG(fentry_cpuset_write_resmask, struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, ssize_t retval) +{ + /* + * On a write to cpuset.cpus, we'll need to configure new cells, bump + * configuration_seq so tick() does that. 
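+	 * cgroup_init() and cgroup_exit() bump it for the same reason when a
+	 * cpuset appears or a cell-owning cgroup goes away.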
+ */ + __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); + return 0; +} + +s32 BPF_STRUCT_OPS(mitosis_cgroup_init, struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + struct cgrp_ctx *cgc; + if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE))) { + scx_bpf_error("cgrp_ctx creation failed for cgid %llu", + cgrp->kn->id); + return -ENOENT; + } + + // Special case for root cell + if (cgrp->kn->id == root_cgid) { + WRITE_ONCE(cgc->cell, ROOT_CELL_ID); + return 0; + } + + DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); + if (!entry) + return -EINVAL; + int rc = get_cgroup_cpumask(cgrp, entry); + if (rc < 0) + return rc; + else if (rc > 0) { + /* + * This cgroup has a cpuset, bump configuration_seq so tick() + * configures it. + */ + __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); + } + + /* Initialize to parent's cell */ + struct cgroup *parent_cg; + if (!(parent_cg = lookup_cgrp_ancestor(cgrp, cgrp->level - 1))) + return -ENOENT; + + struct cgrp_ctx *parent_cgc; + if (!(parent_cgc = lookup_cgrp_ctx(parent_cg))) { + bpf_cgroup_release(parent_cg); + return -ENOENT; + } + + bpf_cgroup_release(parent_cg); + cgc->cell = parent_cgc->cell; + return 0; +} + +s32 BPF_STRUCT_OPS(mitosis_cgroup_exit, struct cgroup *cgrp) +{ + struct cgrp_ctx *cgc; + if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE))) { + scx_bpf_error("cgrp_ctx creation failed for cgid %llu", + cgrp->kn->id); + return -ENOENT; + } + + if (cgc->cell_owner) { + int ret; + if ((ret = free_cell(cgc->cell))) + return ret; + /* + * Need to make sure the cpus of this cell are freed back to the root + * cell and the root cell cpumask can be expanded. Bump + * configuration_seq so tick() does that. + */ + __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); + } + + return 0; +} + +void BPF_STRUCT_OPS(mitosis_cgroup_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + struct task_ctx *tctx; + + if (!(tctx = lookup_task_ctx(p))) + return; + + update_task_cell(p, tctx, to); +} + +void BPF_STRUCT_OPS(mitosis_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{ + struct task_ctx *tctx; + + if (!(tctx = lookup_task_ctx(p))) + return; + + if (!all_cpumask) { + scx_bpf_error("NULL all_cpumask"); + return; + } + + update_task_cpumask(p, tctx); +} + +s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct task_ctx *tctx; + struct bpf_cpumask *cpumask; + int ret; + + tctx = bpf_task_storage_get(&task_ctxs, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!tctx) { + scx_bpf_error("task_ctx allocation failure"); + return -ENOMEM; + } + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + cpumask = bpf_kptr_xchg(&tctx->cpumask, cpumask); + if (cpumask) { + /* Should never happen as we just inserted it above. 
*/ + bpf_cpumask_release(cpumask); + scx_bpf_error("tctx cpumask is unexpectedly populated on init"); + return -EINVAL; + } + + if (!all_cpumask) { + scx_bpf_error("missing all_cpumask"); + return -EINVAL; + } + + /* Initialize L3 to invalid before cell assignment */ + init_task_l3(tctx); + + // TODO clean this up + if ((ret = update_task_cell(p, tctx, args->cgroup))) { + return ret; + } + + return 0; +} + +__hidden void dump_cpumask_word(s32 word, const struct cpumask *cpumask) +{ + u32 u, v = 0; + + bpf_for(u, 0, BITS_PER_U32) + { + s32 cpu = BITS_PER_U32 * word + u; + if (cpu < nr_possible_cpus && + bpf_cpumask_test_cpu(cpu, cpumask)) + v |= 1 << u; + } + scx_bpf_dump("%08x", v); +} + +static void dump_cpumask(const struct cpumask *cpumask) +{ + u32 word, nr_words = (nr_possible_cpus + 31) / 32; + + bpf_for(word, 0, nr_words) + { + if (word) + scx_bpf_dump(","); + dump_cpumask_word(nr_words - word - 1, cpumask); + } +} + +static void dump_cell_cpumask(int id) +{ + const struct cpumask *cell_cpumask; + + if (!(cell_cpumask = lookup_cell_cpumask(id))) + return; + + dump_cpumask(cell_cpumask); +} + +/* Print cell state for debugging */ +static __always_inline void dump_cell_state(u32 cell_idx) +{ + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_dump("Cell %d: NOT FOUND", cell_idx); + return; + } + + scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", + cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); + + u32 l3; + // TODO Print vtimes for L3s + // TODO lock + bpf_for(l3, 0, nr_l3) { + if (cell->l3_cpu_cnt[l3] > 0) { + scx_bpf_dump(" L3[%d]: %d CPUs", l3, cell->l3_cpu_cnt[l3]); + } + } +} + +// TODO: FIX THIS +static __always_inline void dump_l3_state(){ +} + +void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) +{ + dsq_id_t dsq_id; + int i; + struct cell *cell; + struct cpu_ctx *cpu_ctx; + + scx_bpf_dump_header(); + + bpf_for(i, 0, MAX_CELLS) + { + if (!(cell = lookup_cell(i))) + return; + + if (!cell->in_use) + continue; + + scx_bpf_dump("CELL[%d] CPUS=", i); + dump_cell_cpumask(i); + scx_bpf_dump("\n"); + dump_cell_state(i); + } + + bpf_for(i, 0, nr_possible_cpus) + { + if (!(cpu_ctx = lookup_cpu_ctx(i))) + return; + + dsq_id = get_cpu_dsq_id(i); + scx_bpf_dump("CPU[%d] cell=%d vtime=%llu nr_queued=%d\n", i, + cpu_ctx->cell, READ_ONCE(cpu_ctx->vtime_now), + scx_bpf_dsq_nr_queued(dsq_id.raw)); + } + + dump_l3_state(); + +} + +void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, + struct task_struct *p) +{ + struct task_ctx *tctx; + + if (!(tctx = lookup_task_ctx(p))) + return; + + scx_bpf_dump( + "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%llu all_cell_cpus_allowed=%d\n", + p->pid, p->scx.dsq_vtime, tctx->basis_vtime, tctx->cell, + tctx->dsq.raw, tctx->all_cell_cpus_allowed); + scx_bpf_dump("Task[%d] CPUS=", p->pid); + dump_cpumask(p->cpus_ptr); + scx_bpf_dump("\n"); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) +{ + struct bpf_cpumask *cpumask; + u32 i; + s32 ret; + + struct cgroup *rootcg; + if (!(rootcg = bpf_cgroup_from_id(root_cgid))) + return -ENOENT; + + rootcg = bpf_kptr_xchg(&root_cgrp, rootcg); + if (rootcg) + bpf_cgroup_release(rootcg); + + /* setup all_cpumask */ + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + bpf_for(i, 0, nr_possible_cpus) + { + const volatile u8 *u8_ptr; + + if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { + if (*u8_ptr & (1 << (i % 8))) { + bpf_cpumask_set_cpu(i, cpumask); + ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, ANY_NUMA); + if (ret < 0) { + 
bpf_cpumask_release(cpumask); + return ret; + } + } + } else { + return -EINVAL; + } + } + + + cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + /* setup cell cpumasks */ + bpf_for(i, 0, MAX_CELLS) + { + struct cell_cpumask_wrapper *cpumaskw; + if (!(cpumaskw = bpf_map_lookup_elem(&cell_cpumasks, &i))) + return -ENOENT; + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + /* + * Start with all full cpumask for all cells. They'll get setup in + * cgroup_init + */ + bpf_cpumask_setall(cpumask); + + cpumask = bpf_kptr_xchg(&cpumaskw->cpumask, cpumask); + if (cpumask) { + /* Should be impossible, we just initialized the cell cpumask */ + bpf_cpumask_release(cpumask); + return -EINVAL; + } + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + cpumask = bpf_kptr_xchg(&cpumaskw->tmp_cpumask, cpumask); + if (cpumask) { + /* Should be impossible, we just initialized the cell tmp_cpumask */ + bpf_cpumask_release(cpumask); + return -EINVAL; + } + } + + // cells[0].in_use = true; + lookup_cell(0)->in_use = true; + + /* Configure root cell (cell 0) topology at init time using nr_l3 and l3_to_cpu masks */ + recalc_cell_l3_counts(ROOT_CELL_ID); + + /* Create (cell,L3) DSQs for all pairs. Userspace will populate maps. */ + // This is a crazy over-estimate + bpf_for(i, 0, MAX_CELLS) + { + u32 l3; + bpf_for(l3, 0, nr_l3) + { + ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, ANY_NUMA); + if (ret < 0) + scx_bpf_error( "Failed to create DSQ for cell %d, L3 %d: err %d", i, l3, ret); + } + } + + return 0; +} + +void BPF_STRUCT_OPS(mitosis_exit, struct scx_exit_info *ei) +{ + // int i; + // bpf_for(i, 0, MAX_CELLS); { + // dump_cell_state((u32)i); + // } + + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops mitosis = { + .select_cpu = (void *)mitosis_select_cpu, + .enqueue = (void *)mitosis_enqueue, + .dispatch = (void *)mitosis_dispatch, + .tick = (void *)mitosis_tick, + .running = (void *)mitosis_running, + .stopping = (void *)mitosis_stopping, + .set_cpumask = (void *)mitosis_set_cpumask, + .init_task = (void *)mitosis_init_task, + .cgroup_init = (void *)mitosis_cgroup_init, + .cgroup_exit = (void *)mitosis_cgroup_exit, + .cgroup_move = (void *)mitosis_cgroup_move, + .dump = (void *)mitosis_dump, + .dump_task = (void *)mitosis_dump_task, + .init = (void *)mitosis_init, + .exit = (void *)mitosis_exit, + .name = "mitosis", +}; +# File: scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This defines the core data structures, types, and constants + * for the scx_mitosis scheduler, primarily containing `struct cell` + * and `struct task_ctx`. + */ + +#pragma once + +#ifdef LSP +#define __bpf__ +#include "../../../../include/scx/common.bpf.h" +#include "../../../../include/scx/ravg_impl.bpf.h" +#else +#include +#include +#endif + +#include "intf.h" + +#define MAX_L3S 16 + +#include "dsq.bpf.h" + +/* + * A couple of tricky things about checking a cgroup's cpumask: + * + * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get + * this right now is to copy the cpumask to a map entry. Given that cgroup init + * could be re-entrant we have a few per-cpu entries in a map to make this + * doable. 
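+ * (See cgrp_init_percpu_cpumask and DECLARE_CPUMASK_ENTRY() in mitosis.bpf.c.)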
+ * + * Second, cpumask can sometimes be stored as an array in-situ or as a pointer + * and with different lengths. Some bpf_core_type_matches finagling can make + * this all work. + */ +#define MAX_CPUMASK_ENTRIES (4) + +/* + * We don't know how big struct cpumask is at compile time, so just allocate a + * large space and check that it is big enough at runtime + * TODO: This should be deduplicated with the rust code and put in intf.h + */ +#define CPUMASK_LONG_ENTRIES (128) +#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) + +extern const volatile u32 nr_l3; + +extern struct cell_map cells; + + +enum mitosis_constants { + + /* Root cell index */ + ROOT_CELL_ID = 0, + + /* Invalid/unset L3 value */ + // INVALID_L3_ID = -1, + + /* Default weight divisor for vtime calculation */ + DEFAULT_WEIGHT_MULTIPLIER = 100, + + /* Vtime validation multiplier (slice_ns * 8192) */ + VTIME_MAX_FUTURE_MULTIPLIER = 8192, + + /* Bits per u32 for cpumask operations */ + BITS_PER_U32 = 32, + + /* No NUMA constraint for DSQ creation */ + ANY_NUMA = -1, +}; + +struct cell { + struct bpf_spin_lock lock; + + // Whether or not the cell is used or not + u32 in_use; + // Number of CPUs in this cell + u32 cpu_cnt; + // per-L3 vtimes within this cell + u64 l3_vtime_now[MAX_L3S]; + // Number of CPUs from each L3 assigned to this cell + u32 l3_cpu_cnt[MAX_L3S]; + // Number of L3s with at least one CPU in this cell + u32 l3_present_cnt; + + // TODO XXX remove this, only here temporarily to make the code compile + // current vtime of the cell + u64 vtime_now; +}; + +// #if 0 +/* Wrap the spin lock in a struct for verifier */ +// struct cell_lock_wrapper { +// struct bpf_spin_lock lock; +// }; + +// struct cell_locks_map { +// __uint(type, BPF_MAP_TYPE_ARRAY); +// __type(key, u32); +// __type(value, struct cell_lock_wrapper); +// __uint(max_entries, MAX_CELLS); +// }; + +#define WITH_CELL_LOCK(cell_ptr, cell_idx, block) \ + do { \ + struct bpf_spin_lock *lock = get_cell_lock(cell_idx); \ + if (!lock) { \ + scx_bpf_error("Failed to get lock for cell %d", \ + cell_idx); \ + break; \ + } \ + bpf_spin_lock(lock); \ + block bpf_spin_unlock(lock); \ + } while (0) + +static inline struct cell *lookup_cell(int idx) +{ + struct cell *cell; + + // cell = MEMBER_VPTR(cells, [idx]); + cell = bpf_map_lookup_elem(&cells, &idx); + + + if (!cell) { + scx_bpf_error("Invalid cell %d", idx); + return NULL; + } + return cell; +} + +static inline struct bpf_spin_lock *get_cell_lock(u32 cell_idx) +{ + if (cell_idx >= MAX_CELLS) { + scx_bpf_error("Invalid cell index %d", cell_idx); + return NULL; + } + + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_error("Cell %d not found", cell_idx); + return NULL; + } + return &cell->lock; +} +// #endif + +/* + * task_ctx is the per-task information kept by scx_mitosis + */ +struct task_ctx { + /* cpumask is the set of valid cpus this task can schedule on */ + /* (tasks cpumask anded with its cell cpumask) */ + struct bpf_cpumask __kptr *cpumask; + /* started_running_at for recording runtime */ + u64 started_running_at; + u64 basis_vtime; + /* For the sake of monitoring, each task is owned by a cell */ + u32 cell; + /* For the sake of scheduling, a task is exclusively owned by either a cell + * or a cpu */ + dsq_id_t dsq; + /* latest configuration that was applied for this task */ + /* (to know if it has to be re-applied) */ + u32 configuration_seq; + /* Is this task allowed on all cores of its cell? 
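+	 * If not, the task is dispatched through a per-CPU DSQ rather than
+	 * its cell's (cell, L3) DSQ.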
*/ + bool all_cell_cpus_allowed; + // Which L3 this task is assigned to + s32 l3; + +#if MITOSIS_ENABLE_STEALING + /* When a task is stolen, dispatch() marks the destination L3 here. + * running() applies the retag and recomputes cpumask (vtime preserved). + */ + s32 pending_l3; + u32 steal_count; /* how many times this task has been stolen */ + u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ + u32 steals_prevented; /* how many times this task has been prevented from being stolen */ +#endif +}; + +// These could go in mitosis.bpf.h, but we'll cross that bridge when we get +static inline const struct cpumask *lookup_cell_cpumask(int idx); + +static inline struct task_ctx *lookup_task_ctx(struct task_struct *p); + +/* MAP TYPES */ +struct function_counters_map { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, NR_COUNTERS); +}; + +struct cell_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cell); + __uint(max_entries, MAX_CELLS); +}; + +struct rcu_read_guard { + bool active; +}; + +static inline struct rcu_read_guard rcu_read_lock_guard(void) +{ + bpf_rcu_read_lock(); + return (struct rcu_read_guard){ .active = true }; +} + +static inline void rcu_read_guard_release(struct rcu_read_guard *guard) +{ + if (guard->active) { + bpf_rcu_read_unlock(); + guard->active = false; + } +} +#define RCU_READ_GUARD() \ + struct rcu_read_guard __rcu_guard \ + __attribute__((__cleanup__(rcu_read_guard_release))) = \ + rcu_read_lock_guard() + +struct cpumask_guard { + struct bpf_cpumask *mask; +}; + +static inline struct cpumask_guard cpumask_create_guard(void) +{ + struct bpf_cpumask *mask = bpf_cpumask_create(); + return (struct cpumask_guard){ .mask = mask }; +} + +static inline void cpumask_guard_release(struct cpumask_guard *guard) +{ + if (guard->mask) { + bpf_cpumask_release(guard->mask); + guard->mask = NULL; + } +} + +#define CPUMASK_GUARD(var_name) \ + struct cpumask_guard var_name \ + __attribute__((__cleanup__(cpumask_guard_release))) = \ + cpumask_create_guard() diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 00045df399..130f4f2480 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -5,13 +5,13 @@ #ifndef __INTF_H #define __INTF_H -#ifndef __KERNEL__ +#ifndef __BPF__ +#include typedef unsigned long long u64; typedef unsigned int u32; typedef _Bool bool; #endif - #ifdef LSP #define __bpf__ #include "../../../../include/scx/ravg.bpf.h" @@ -49,24 +49,51 @@ enum consts { struct cell { // This is a lock in the kernel and padding in the user - CELL_LOCK_T lock; + CELL_LOCK_T lock; // Assumed to be the first entry (see below) // Whether or not the cell is used u32 in_use; + // Number of CPUs in this cell u32 cpu_cnt; - // per-L3 vtimes within this cell - u64 l3_vtime_now[MAX_L3S]; - // Number of CPUs from each L3 assigned to this cell - u32 l3_cpu_cnt[MAX_L3S]; + // Number of L3s with at least one CPU in this cell u32 l3_present_cnt; - // TODO XXX remove this, only here temporarily to make the code compile - // current vtime of the cell - u64 vtime_now; + // Number of CPUs from each L3 assigned to this cell + u32 l3_cpu_cnt[MAX_L3S]; + + // per-L3 vtimes within this cell + u64 l3_vtime_now[MAX_L3S]; }; +// Putting the lock first in the struct is our convention. +// We pad this space when in Rust code that will never see the lock value. 
+// We intentionally avoid it in copy_cell_no_lock to keep the verifier happy. +// It is a BPF constraint that it is 4 byte aligned. + +// All assertions work for both BPF and userspace builds +_Static_assert(offsetof(struct cell, lock) == 0, + "lock/padding must be first field"); + +_Static_assert(sizeof(((struct cell *)0)->lock) == 4, + "lock/padding must be 4 bytes"); + +_Static_assert(_Alignof(CELL_LOCK_T) == 4, + "lock/padding must be 4-byte aligned"); + +_Static_assert(offsetof(struct cell, in_use) == 4, + "in_use must follow 4-byte lock/padding"); + +// Verify these are the same size in both BPF and Rust. +_Static_assert(sizeof(struct cell) == + ( (4 * sizeof(u32)) + (4 * MAX_L3S) + (8 * MAX_L3S)), + "struct cell size must be stable for Rust bindings"); + +// Ensure no unexpected padding was added +_Static_assert(sizeof(struct cell) == 208, + "struct cell must be exactly 208 bytes"); + /* Statistics */ enum cell_stat_idx { CSTAT_LOCAL, diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h index 7ed77d68c3..eb5e35c352 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -138,7 +138,7 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) } } // unlock RCU - + // Write to cell bpf_spin_lock(&cell->lock); for (u32 l3 = 0; l3 < nr_l3; l3++) { cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; @@ -159,7 +159,6 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) * @cell_id: The cell ID to select an L3 from * @return: L3 ID on success, L3_INVALID on error */ -// TODO: Lock static inline s32 pick_l3_for_task(u32 cell_id) { struct cell *cell; @@ -170,9 +169,15 @@ static inline s32 pick_l3_for_task(u32 cell_id) return L3_INVALID; } + // Snapshot the current state of the cell + struct cell cell_snapshot; + bpf_spin_lock(&cell->lock); + copy_cell_skip_lock(&cell_snapshot, cell); + bpf_spin_unlock(&cell->lock); + // No cpus - if (!cell->cpu_cnt) { - scx_bpf_error( "pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); + if (!cell_snapshot.cpu_cnt) { + scx_bpf_error("pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); return L3_INVALID; } @@ -180,14 +185,14 @@ static inline s32 pick_l3_for_task(u32 cell_id) * weighted selection - accumulate CPU counts until we exceed target */ /* Generate random target value in range [0, cpu_cnt) */ - u32 target = bpf_get_prandom_u32() % cell->cpu_cnt; + u32 target = bpf_get_prandom_u32() % cell_snapshot.cpu_cnt; u32 l3, cur = 0; s32 ret = L3_INVALID; // This could be a prefix sum. Find first l3 where we exceed target bpf_for(l3, 0, nr_l3) { - cur += cell->l3_cpu_cnt[l3]; + cur += cell_snapshot.l3_cpu_cnt[l3]; if (target < cur) { ret = (s32)l3; break; diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index b920ecaf25..c1e6acf17e 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -182,15 +182,15 @@ static inline int allocate_cell() if (!(c = lookup_cell(cell_idx))) return -1; - if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) { - // TODO XXX, I think we need to make this concurrent safe - // TODO, lock with recalc_cell...() - __builtin_memset(c->l3_cpu_cnt, 0, sizeof(c->l3_cpu_cnt)); - c->l3_present_cnt = 0; - // TODO zero cpu_cnt - // TODO Just zero the whole cell struct? 
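+		/* Take the cell lock so claiming and zeroing the cell cannot
+		 * race with other allocators or recalc_cell_l3_counts() */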
+ bpf_spin_lock(&c->lock); + if (c->in_use == 0) { + // Zero everything except the lock (which is first) + __builtin_memset(&c->in_use, 0, sizeof(struct cell) - sizeof(CELL_LOCK_T)); + c->in_use = 1; // Then mark as in use + bpf_spin_unlock(&c->lock); return cell_idx; } + bpf_spin_unlock(&c->lock); } scx_bpf_error("No available cells to allocate"); return -1; @@ -309,7 +309,6 @@ static inline int update_task_cpumask(struct task_struct *p, // We want to set the task vtime to that of the cell it's joining. if (tctx->all_cell_cpus_allowed) { - const struct cpumask *l3_mask = NULL; if (tctx->l3 != L3_INVALID) { l3_mask = lookup_l3_cpumask((u32)tctx->l3); @@ -346,8 +345,10 @@ static inline int update_task_cpumask(struct task_struct *p, tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); struct cell *cell = lookup_cell(tctx->cell); - if (!cell) + if (!cell) { + scx_bpf_error("Invalid cell"); return -ENOENT; + } if (!l3_is_valid(tctx->l3)){ scx_bpf_error("Invalid L3 %d", tctx->l3); @@ -1099,7 +1100,7 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) !(cell = lookup_cell(cctx->cell))) return; - /* + /* * If this task was stolen across L3s, retag to thief L3 and recompute * effective cpumask+DSQ. Preserve vtime to keep fairness. */ @@ -1385,7 +1386,7 @@ static __always_inline void dump_cell_state(u32 cell_idx) } scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", - cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); + cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); u32 l3; // TODO Print vtimes for L3s @@ -1399,6 +1400,8 @@ static __always_inline void dump_cell_state(u32 cell_idx) // TODO: FIX THIS static __always_inline void dump_l3_state(){ + + } void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index 4441a19a27..e42f7379f2 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -46,8 +46,6 @@ extern const volatile u32 nr_l3; - - extern struct cell_map cells; enum mitosis_constants { @@ -71,8 +69,16 @@ enum mitosis_constants { ANY_NUMA = -1, }; - - +static inline void copy_cell_skip_lock(struct cell *dst, const struct cell *src) +{ + /* Copy everything AFTER the lock field. + * Since lock is first and 4 bytes (verified by static assertions), + * we skip it and copy the remainder of the struct. 
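+	 * This lets callers such as pick_l3_for_task() take a consistent
+	 * snapshot of a cell under its spin lock without touching the lock itself.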
+ */ + __builtin_memcpy(&dst->in_use, + &src->in_use, + sizeof(struct cell) - sizeof(CELL_LOCK_T)); +} static inline struct cell *lookup_cell(int idx) { From 73a86232faa2edcff9026b8910e59fcc3257e1bc Mon Sep 17 00:00:00 2001 From: tommy-u Date: Thu, 9 Oct 2025 14:17:18 -0700 Subject: [PATCH 10/12] Clang format --- scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h | 82 ++++++++---- scheds/rust/scx_mitosis/src/bpf/intf.h | 22 ++-- .../rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 26 ++-- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 121 +++++++++++++----- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h | 33 +++-- 5 files changed, 195 insertions(+), 89 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h index a8a8a21c2e..fc50f17fba 100644 --- a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -70,32 +70,51 @@ #endif /* ---- Bitfield widths (bits) ---- */ -#define CPU_B 28 -#define L3_B 16 -#define CELL_B 12 -#define TYPE_B 4 -#define DATA_B 28 -#define RSVD_B 32 +#define CPU_B 28 +#define L3_B 16 +#define CELL_B 12 +#define TYPE_B 4 +#define DATA_B 28 +#define RSVD_B 32 /* Sum checks (in bits) */ -_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); -_Static_assert(L3_B + CELL_B + TYPE_B == 32, "CELL+L3 layout low half must be 32 bits"); -_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); +_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); +_Static_assert(L3_B + CELL_B + TYPE_B == 32, + "CELL+L3 layout low half must be 32 bits"); +_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); typedef union { u64 raw; /* Per-CPU user DSQ */ - struct { u64 cpu: CPU_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cpu_dsq; + struct { + u64 cpu : CPU_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } cpu_dsq; /* Cell+L3 user DSQ */ - struct { u64 l3: L3_B; u64 cell: CELL_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cell_l3_dsq; + struct { + u64 l3 : L3_B; + u64 cell : CELL_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } cell_l3_dsq; /* Generic user view */ - struct { u64 data: DATA_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } user_dsq; + struct { + u64 data : DATA_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } user_dsq; /* Built-in DSQ view */ - struct { u64 value:32; u64 rsvd:30; u64 local_on:1; u64 builtin:1; } builtin_dsq; + struct { + u64 value : 32; + u64 rsvd : 30; + u64 local_on : 1; + u64 builtin : 1; + } builtin_dsq; /* NOTE: Considered packed and aligned attributes, but that's redundant */ } dsq_id_t; @@ -105,16 +124,22 @@ typedef union { * invalid bc bit 63 clear (it's a user DSQ) && dsq_type == 0 (no type) * Good for catching uninitialized DSQ IDs. 
*/ -#define DSQ_INVALID ((u64) 0) +#define DSQ_INVALID ((u64)0) -_Static_assert(sizeof(((dsq_id_t){0}).cpu_dsq) == sizeof(u64), "cpu view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).cell_l3_dsq) == sizeof(u64), "cell+l3 view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).user_dsq) == sizeof(u64), "user common view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).builtin_dsq) == sizeof(u64), "builtin view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).cpu_dsq) == sizeof(u64), + "cpu view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).cell_l3_dsq) == sizeof(u64), + "cell+l3 view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).user_dsq) == sizeof(u64), + "user common view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).builtin_dsq) == sizeof(u64), + "builtin view must be 8 bytes"); /* Compile-time checks (in bytes) */ -_Static_assert(sizeof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8 bytes (64 bits)"); -_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8-byte aligned"); +_Static_assert(sizeof(dsq_id_t) == sizeof(u64), + "dsq_id_t must be 8 bytes (64 bits)"); +_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), + "dsq_id_t must be 8-byte aligned"); /* DSQ type enumeration */ enum dsq_type { @@ -124,17 +149,20 @@ enum dsq_type { }; /* Range guards */ -_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); -_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); +_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); +_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); _Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); -_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), "DSQ_TYPE_CELL_L3 must fit in field"); +_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), + "DSQ_TYPE_CELL_L3 must fit in field"); /* * While I considered error propagation, I decided to bail to force errors early. */ -static inline bool is_user_dsq(dsq_id_t dsq_id){ - return !dsq_id.builtin_dsq.builtin && dsq_id.user_dsq.type != DSQ_TYPE_NONE; +static inline bool is_user_dsq(dsq_id_t dsq_id) +{ + return !dsq_id.builtin_dsq.builtin && + dsq_id.user_dsq.type != DSQ_TYPE_NONE; } // Is this a per CPU DSQ? 
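 // (i.e., a user DSQ whose type field is DSQ_TYPE_CPU; see is_cpu_dsq() below)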
@@ -167,5 +195,7 @@ static inline dsq_id_t get_cell_l3_dsq_id(u32 cell, u32 l3) if (cell >= MAX_CELLS || l3 >= MAX_L3S) scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); - return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; + return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, + .cell = cell, + .type = DSQ_TYPE_CELL_L3 } }; } diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 130f4f2480..b1612430c6 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -41,10 +41,13 @@ enum consts { /* Kernel side sees the real lock; userspace sees padded bytes of same size/alignment */ #if defined(__BPF__) -# define CELL_LOCK_T struct bpf_spin_lock +#define CELL_LOCK_T struct bpf_spin_lock #else /* userspace placeholder: kernel won’t copy spin_lock */ -# define CELL_LOCK_T struct { u32 __pad; } /* 4-byte aligned as required */ +#define CELL_LOCK_T \ + struct { \ + u32 __pad; \ + } /* 4-byte aligned as required */ #endif struct cell { @@ -74,25 +77,24 @@ struct cell { // All assertions work for both BPF and userspace builds _Static_assert(offsetof(struct cell, lock) == 0, - "lock/padding must be first field"); + "lock/padding must be first field"); _Static_assert(sizeof(((struct cell *)0)->lock) == 4, - "lock/padding must be 4 bytes"); + "lock/padding must be 4 bytes"); _Static_assert(_Alignof(CELL_LOCK_T) == 4, - "lock/padding must be 4-byte aligned"); + "lock/padding must be 4-byte aligned"); _Static_assert(offsetof(struct cell, in_use) == 4, - "in_use must follow 4-byte lock/padding"); + "in_use must follow 4-byte lock/padding"); // Verify these are the same size in both BPF and Rust. _Static_assert(sizeof(struct cell) == - ( (4 * sizeof(u32)) + (4 * MAX_L3S) + (8 * MAX_L3S)), - "struct cell size must be stable for Rust bindings"); + ((4 * sizeof(u32)) + (4 * MAX_L3S) + (8 * MAX_L3S)), + "struct cell size must be stable for Rust bindings"); -// Ensure no unexpected padding was added _Static_assert(sizeof(struct cell) == 208, - "struct cell must be exactly 208 bytes"); + "struct cell must be exactly 208 bytes"); /* Statistics */ enum cell_stat_idx { diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h index eb5e35c352..2e5281984b 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -15,7 +15,7 @@ #include "intf.h" typedef u32 l3_id_t; -#define L3_INVALID ((l3_id_t) ~0u) +#define L3_INVALID ((l3_id_t)~0u) // Configure how aggressively we steal work. 
// When task is detected as a steal candidate, skip it this many times @@ -51,14 +51,16 @@ extern struct cpu_to_l3_map cpu_to_l3; extern struct l3_to_cpus_map l3_to_cpus; extern struct steal_stats_map steal_stats; -static inline const bool l3_is_valid(u32 l3_id) { +static inline const bool l3_is_valid(u32 l3_id) +{ if (l3_id == L3_INVALID) return false; return (l3_id >= 0) && (l3_id < MAX_L3S); } -static inline void init_task_l3(struct task_ctx *tctx) { +static inline void init_task_l3(struct task_ctx *tctx) +{ tctx->l3 = L3_INVALID; #if MITOSIS_ENABLE_STEALING @@ -67,7 +69,6 @@ static inline void init_task_l3(struct task_ctx *tctx) { tctx->last_stolen_at = 0; tctx->steals_prevented = 0; #endif - } static inline const struct cpumask *lookup_l3_cpumask(u32 l3) @@ -101,7 +102,7 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) u32 l3, l3s_present = 0, total_cpus = 0; // Just so we don't hold the lock longer than necessary - u32 l3_cpu_cnt_tmp[MAX_L3S] = {0}; + u32 l3_cpu_cnt_tmp[MAX_L3S] = { 0 }; { // RCU context RCU_READ_GUARD(); @@ -109,7 +110,8 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) lookup_cell_cpumask(cell_idx); // RCU ptr if (!cell_mask) { - scx_bpf_error("recalc_cell_l3_counts: invalid cell mask"); + scx_bpf_error( + "recalc_cell_l3_counts: invalid cell mask"); return; } @@ -117,13 +119,15 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) { const struct cpumask *l3_mask = lookup_l3_cpumask(l3); if (!l3_mask) { - scx_bpf_error( "recalc_cell_l3_counts: invalid l3 mask"); + scx_bpf_error( + "recalc_cell_l3_counts: invalid l3 mask"); return; } bpf_cpumask_and(tmp_guard.mask, cell_mask, l3_mask); - u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp_guard.mask); + u32 cnt = bpf_cpumask_weight( + (const struct cpumask *)tmp_guard.mask); l3_cpu_cnt_tmp[l3] = cnt; @@ -141,7 +145,7 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx) // Write to cell bpf_spin_lock(&cell->lock); for (u32 l3 = 0; l3 < nr_l3; l3++) { - cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; + cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; } cell->l3_present_cnt = l3s_present; @@ -177,7 +181,9 @@ static inline s32 pick_l3_for_task(u32 cell_id) // No cpus if (!cell_snapshot.cpu_cnt) { - scx_bpf_error("pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); + scx_bpf_error( + "pick_l3_for_task: cell %d has no CPUs accounted yet", + cell_id); return L3_INVALID; } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index c1e6acf17e..44cfee2f3d 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -63,7 +63,8 @@ struct l3_to_cpus_map l3_to_cpus SEC(".maps"); struct function_counters_map function_counters SEC(".maps"); struct steal_stats_map steal_stats SEC(".maps"); -static inline void increment_counter(enum fn_counter_idx idx) { +static inline void increment_counter(enum fn_counter_idx idx) +{ u64 *counter; u32 key = idx; @@ -168,7 +169,6 @@ static inline struct cpu_ctx *lookup_cpu_ctx(int cpu) return cctx; } - /* * Cells are allocated concurrently in some cases (e.g. cgroup_init). 
* allocate_cell and free_cell enable these allocations to be done safely @@ -185,8 +185,10 @@ static inline int allocate_cell() bpf_spin_lock(&c->lock); if (c->in_use == 0) { // Zero everything except the lock (which is first) - __builtin_memset(&c->in_use, 0, sizeof(struct cell) - sizeof(CELL_LOCK_T)); - c->in_use = 1; // Then mark as in use + __builtin_memset(&c->in_use, 0, + sizeof(struct cell) - + sizeof(CELL_LOCK_T)); + c->in_use = 1; // Then mark as in use bpf_spin_unlock(&c->lock); return cell_idx; } @@ -313,7 +315,8 @@ static inline int update_task_cpumask(struct task_struct *p, if (tctx->l3 != L3_INVALID) { l3_mask = lookup_l3_cpumask((u32)tctx->l3); /* If the L3 no longer intersects the cell's cpumask, invalidate it */ - if (!l3_mask || !bpf_cpumask_intersects(cell_cpumask, l3_mask)) + if (!l3_mask || + !bpf_cpumask_intersects(cell_cpumask, l3_mask)) tctx->l3 = L3_INVALID; } @@ -333,10 +336,13 @@ static inline int update_task_cpumask(struct task_struct *p, /* --- Narrow the effective cpumask by the chosen L3 --- */ /* tctx->cpumask already contains (task_affinity ∧ cell_mask) */ if (tctx->cpumask) - bpf_cpumask_and(tctx->cpumask, (const struct cpumask *)tctx->cpumask, l3_mask); + bpf_cpumask_and(tctx->cpumask, + (const struct cpumask *)tctx->cpumask, + l3_mask); /* If empty after intersection, nothing can run here */ - if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { + if (tctx->cpumask && + bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { scx_bpf_error("Empty cpumask after intersection"); return -ENODEV; } @@ -350,7 +356,7 @@ static inline int update_task_cpumask(struct task_struct *p, return -ENOENT; } - if (!l3_is_valid(tctx->l3)){ + if (!l3_is_valid(tctx->l3)) { scx_bpf_error("Invalid L3 %d", tctx->l3); return -EINVAL; } @@ -571,7 +577,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) return; if (!l3_is_valid(tctx->l3)) { - scx_bpf_error("Invalid L3 ID for task %d in enqueue", p->pid); + scx_bpf_error("Invalid L3 ID for task %d in enqueue", + p->pid); return; } basis_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); @@ -640,7 +647,8 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) /* Check the L3 queue */ if (l3 != L3_INVALID) { dsq_id_t cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); - bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) { + bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) + { min_vtime = p->scx.dsq_vtime; min_vtime_dsq = cell_l3_dsq; found = true; @@ -649,7 +657,8 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) } /* Check the CPU DSQ for a lower vtime */ - bpf_for_each(scx_dsq, p, local_dsq.raw, 0) { + bpf_for_each(scx_dsq, p, local_dsq.raw, 0) + { if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { min_vtime = p->scx.dsq_vtime; min_vtime_dsq = local_dsq; @@ -664,7 +673,6 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) * we might never wakeup. */ - if (found) { // We found a task in the local or cell-L3 DSQ @@ -824,7 +832,8 @@ int running; /* The guard is a stack variable. When it falls out of scope, * we drop the running lock. 
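 * (the guard's cleanup attribute calls __running_unlock() automatically
 * when it goes out of scope)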
*/ -static inline void __running_unlock(int *guard) { +static inline void __running_unlock(int *guard) +{ (void)guard; /* unused */ WRITE_ONCE(running, 0); } @@ -834,7 +843,6 @@ static inline void __running_unlock(int *guard) { */ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) { - u32 local_configuration_seq = READ_ONCE(configuration_seq); if (local_configuration_seq == READ_ONCE(applied_configuration_seq)) return; @@ -908,7 +916,8 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) * Iterate over all cgroups, check if any have a cpumask and populate them * as a separate cell. */ - bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) { + bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) + { cur_cgrp = pos->cgroup; /* @@ -1033,7 +1042,8 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) int cpu_idx; bpf_for(cpu_idx, 0, nr_possible_cpus) { - if (bpf_cpumask_test_cpu( cpu_idx, (const struct cpumask *)root_bpf_cpumask)) { + if (bpf_cpumask_test_cpu(cpu_idx, (const struct cpumask *) + root_bpf_cpumask)) { struct cpu_ctx *cpu_ctx; if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) goto out_root_cgrp; @@ -1058,10 +1068,12 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) int cell_idx; /* Recalculate L3 counts for all active cells after CPU assignment changes */ - bpf_for(cell_idx, 1, MAX_CELLS) { + bpf_for(cell_idx, 1, MAX_CELLS) + { struct cell *cell; if (!(cell = lookup_cell(cell_idx))) { - scx_bpf_error("Lookup for cell %d failed in tick()", cell_idx); + scx_bpf_error("Lookup for cell %d failed in tick()", + cell_idx); goto out_root_cgrp; } @@ -1132,8 +1144,10 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) * Update per-(cell, L3) vtime for cell-schedulable tasks */ if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { - if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), p->scx.dsq_vtime)) - WRITE_ONCE(cell->l3_vtime_now[tctx->l3], p->scx.dsq_vtime); + if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), + p->scx.dsq_vtime)) + WRITE_ONCE(cell->l3_vtime_now[tctx->l3], + p->scx.dsq_vtime); } /* @@ -1386,22 +1400,67 @@ static __always_inline void dump_cell_state(u32 cell_idx) } scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", - cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); + cell_idx, cell->in_use, cell->cpu_cnt, + cell->l3_present_cnt); u32 l3; // TODO Print vtimes for L3s // TODO lock - bpf_for(l3, 0, nr_l3) { + bpf_for(l3, 0, nr_l3) + { if (cell->l3_cpu_cnt[l3] > 0) { - scx_bpf_dump(" L3[%d]: %d CPUs", l3, cell->l3_cpu_cnt[l3]); + scx_bpf_dump(" L3[%d]: %d CPUs", l3, + cell->l3_cpu_cnt[l3]); } } } -// TODO: FIX THIS -static __always_inline void dump_l3_state(){ +static __always_inline void dump_l3_state() +{ + u32 l3; + const struct cpumask *l3_mask; + dsq_id_t dsq_id; + + scx_bpf_dump("\n=== L3 Cache Topology ===\n"); + scx_bpf_dump("Total L3 domains: %d\n", nr_l3); + bpf_for(l3, 0, nr_l3) + { + l3_mask = lookup_l3_cpumask(l3); + if (!l3_mask) { + scx_bpf_dump( + "L3[%d]: ERROR - failed to lookup cpumask\n", + l3); + continue; + } + + scx_bpf_dump("L3[%d] CPUS=", l3); + dump_cpumask(l3_mask); + scx_bpf_dump("\n"); + scx_bpf_dump(" Per-cell DSQ stats:\n"); + u32 cell_idx; + bpf_for(cell_idx, 0, MAX_CELLS) + { + struct cell *cell = lookup_cell(cell_idx); + if (!cell || !cell->in_use) + continue; + + if (!l3_is_valid(l3)) + continue; + + dsq_id = get_cell_l3_dsq_id(cell_idx, l3); + u64 nr_queued = scx_bpf_dsq_nr_queued(dsq_id.raw); + + if (nr_queued > 0 || 
cell->l3_cpu_cnt[l3] > 0) { + scx_bpf_dump( + " Cell[%d]: %d CPUs, vtime=%llu, nr_queued=%llu\n", + cell_idx, cell->l3_cpu_cnt[l3], + READ_ONCE(cell->l3_vtime_now[l3]), + nr_queued); + } + } + } } void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) @@ -1439,7 +1498,6 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) } dump_l3_state(); - } void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, @@ -1485,7 +1543,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { if (*u8_ptr & (1 << (i % 8))) { bpf_cpumask_set_cpu(i, cpumask); - ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, ANY_NUMA); + ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, + ANY_NUMA); if (ret < 0) { bpf_cpumask_release(cpumask); return ret; @@ -1496,7 +1555,6 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) } } - cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); if (cpumask) bpf_cpumask_release(cpumask); @@ -1553,9 +1611,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) u32 l3; bpf_for(l3, 0, nr_l3) { - ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, ANY_NUMA); + ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, + ANY_NUMA); if (ret < 0) - scx_bpf_error( "Failed to create DSQ for cell %d, L3 %d: err %d", i, l3, ret); + scx_bpf_error( + "Failed to create DSQ for cell %d, L3 %d: err %d", + i, l3, ret); } } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index e42f7379f2..52738c6a21 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -75,9 +75,8 @@ static inline void copy_cell_skip_lock(struct cell *dst, const struct cell *src) * Since lock is first and 4 bytes (verified by static assertions), * we skip it and copy the remainder of the struct. 
*/ - __builtin_memcpy(&dst->in_use, - &src->in_use, - sizeof(struct cell) - sizeof(CELL_LOCK_T)); + __builtin_memcpy(&dst->in_use, &src->in_use, + sizeof(struct cell) - sizeof(CELL_LOCK_T)); } static inline struct cell *lookup_cell(int idx) @@ -166,35 +165,43 @@ struct rcu_read_guard { bool active; }; -static inline struct rcu_read_guard rcu_read_lock_guard(void) { +static inline struct rcu_read_guard rcu_read_lock_guard(void) +{ bpf_rcu_read_lock(); - return (struct rcu_read_guard){.active = true}; + return (struct rcu_read_guard){ .active = true }; } -static inline void rcu_read_guard_release(struct rcu_read_guard *guard) { +static inline void rcu_read_guard_release(struct rcu_read_guard *guard) +{ if (guard->active) { bpf_rcu_read_unlock(); guard->active = false; } } -#define RCU_READ_GUARD() \ - struct rcu_read_guard __rcu_guard __attribute__((__cleanup__(rcu_read_guard_release))) = rcu_read_lock_guard() +#define RCU_READ_GUARD() \ + struct rcu_read_guard __rcu_guard \ + __attribute__((__cleanup__(rcu_read_guard_release))) = \ + rcu_read_lock_guard() struct cpumask_guard { struct bpf_cpumask *mask; }; -static inline struct cpumask_guard cpumask_create_guard(void) { +static inline struct cpumask_guard cpumask_create_guard(void) +{ struct bpf_cpumask *mask = bpf_cpumask_create(); - return (struct cpumask_guard){.mask = mask}; + return (struct cpumask_guard){ .mask = mask }; } -static inline void cpumask_guard_release(struct cpumask_guard *guard) { +static inline void cpumask_guard_release(struct cpumask_guard *guard) +{ if (guard->mask) { bpf_cpumask_release(guard->mask); guard->mask = NULL; } } -#define CPUMASK_GUARD(var_name) \ - struct cpumask_guard var_name __attribute__((__cleanup__(cpumask_guard_release))) = cpumask_create_guard() +#define CPUMASK_GUARD(var_name) \ + struct cpumask_guard var_name \ + __attribute__((__cleanup__(cpumask_guard_release))) = \ + cpumask_create_guard() From d0a7eed1d999faf8d15093b2eda02eff4fc46f53 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Thu, 9 Oct 2025 14:54:41 -0700 Subject: [PATCH 11/12] remove accidental code file --- code.txt | 2382 ------------------------------------------------------ 1 file changed, 2382 deletions(-) delete mode 100644 code.txt diff --git a/code.txt b/code.txt deleted file mode 100644 index 64c3002bbe..0000000000 --- a/code.txt +++ /dev/null @@ -1,2382 +0,0 @@ -]633;E;for file in scheds/rust/scx_mitosis/src/bpf/*;7dc75c10-53e2-4af4-8cab-ea0159bd7502]633;C# File: scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h -/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -/* - * This software may be used and distributed according to the terms of the - * GNU General Public License version 2. - * - * This header defines the 64-bit dispatch queue (DSQ) ID encoding - * scheme for scx_mitosis, using type fields to distinguish between - * per-CPU and cell+L3 domain queues. It includes helper functions to - * construct, validate, and parse these DSQ IDs for queue management. - */ -#pragma once - -#include "intf.h" -#include "mitosis.bpf.h" - -/* - * ================================ - * BPF DSQ ID Layout (64 bits wide) - * ================================ - * - * Top-level format: - * [63] [62..0] - * [ B] [ ID ] - * - * If B == 1 it is a Built-in DSQ - * ------------------------- - * [63] [62] [61 .. 
32] [31..0] - * [ 1] [ L] [ R ] [ V ] - * - * - L (bit 62): LOCAL_ON flag - * If L == 1 -> V = CPU number - * - R (30 bits): reserved / unused - * - V (32 bits): value (e.g., CPU#) - * - * If B == 0 -> User-defined DSQ - * ----------------------------- - * Only the low 32 bits are used. - * - * [63 .. 32] [31..0] - * [ 0][ unused ] [ VAL ] - * - * Mitosis uses VAL as follows: - * - * [31..28] [27..0] - * [QTYPE ] [DATA ] - * - * QTYPE encodes the queue type: - * - * QTYPE = 0x1 -> Per-CPU Q - * [31..28] [27 .. .. 0] - * [ 0001 ] [ CPU# ] - * [Q-TYPE:1] - * - * QTYPE = 0x2 -> Cell+L3 Q - * [31..28] [27 .. 16] [15 .. 0] - * [ 0010 ] [ CELL# ] [ L3ID ] - * [Q-TYPE:2] - * - */ -/* - * The use of these bitfields depends on compiler defined byte AND bit ordering. - * Make sure we're only building with Clang/LLVM and that we're little-endian. - */ -#ifndef __clang__ -#error "This code must be compiled with Clang/LLVM (eBPF: clang -target bpf)." -#endif - -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ -#error "dsq64 bitfield layout assumes little-endian (bpfel)." -#endif - -/* ---- Bitfield widths (bits) ---- */ -#define CPU_B 28 -#define L3_B 16 -#define CELL_B 12 -#define TYPE_B 4 -#define DATA_B 28 -#define RSVD_B 32 - -/* Sum checks (in bits) */ -_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); -_Static_assert(L3_B + CELL_B + TYPE_B == 32, "CELL+L3 layout low half must be 32 bits"); -_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); - -typedef union { - u64 raw; - - /* Per-CPU user DSQ */ - struct { u64 cpu: CPU_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cpu_dsq; - - /* Cell+L3 user DSQ */ - struct { u64 l3: L3_B; u64 cell: CELL_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cell_l3_dsq; - - /* Generic user view */ - struct { u64 data: DATA_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } user_dsq; - - /* Built-in DSQ view */ - struct { u64 value:32; u64 rsvd:30; u64 local_on:1; u64 builtin:1; } builtin_dsq; - - /* NOTE: Considered packed and aligned attributes, but that's redundant */ -} dsq_id_t; - -/* - * Invalid DSQ ID Sentinel: - * invalid bc bit 63 clear (it's a user DSQ) && dsq_type == 0 (no type) - * Good for catching uninitialized DSQ IDs. -*/ -#define DSQ_INVALID ((u64) 0) - -_Static_assert(sizeof(((dsq_id_t){0}).cpu_dsq) == sizeof(u64), "cpu view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).cell_l3_dsq) == sizeof(u64), "cell+l3 view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).user_dsq) == sizeof(u64), "user common view must be 8 bytes"); -_Static_assert(sizeof(((dsq_id_t){0}).builtin_dsq) == sizeof(u64), "builtin view must be 8 bytes"); - -/* Compile-time checks (in bytes) */ -_Static_assert(sizeof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8 bytes (64 bits)"); -_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8-byte aligned"); - -/* DSQ type enumeration */ -enum dsq_type { - DSQ_TYPE_NONE, - DSQ_TYPE_CPU, - DSQ_TYPE_CELL_L3, -}; - -/* Range guards */ -_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); -_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); -_Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); -_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), "DSQ_TYPE_CELL_L3 must fit in field"); - -/* - * While I considered error propagation, I decided to bail to force errors early. 
-*/ - -static inline bool is_user_dsq(dsq_id_t dsq_id){ - return !dsq_id.builtin_dsq.builtin && dsq_id.user_dsq.type != DSQ_TYPE_NONE; -} - -// Is this a per CPU DSQ? -static inline bool is_cpu_dsq(dsq_id_t dsq_id) -{ - return is_user_dsq(dsq_id) && dsq_id.user_dsq.type == DSQ_TYPE_CPU; -} - -// If this is a per cpu dsq, return the cpu -static inline u32 get_cpu_from_dsq(dsq_id_t dsq_id) -{ - if (!is_cpu_dsq(dsq_id)) - scx_bpf_error("trying to get cpu from non-cpu dsq\n"); - - return dsq_id.cpu_dsq.cpu; -} - -/* Helper functions to construct DSQ IDs */ -static inline dsq_id_t get_cpu_dsq_id(u32 cpu) -{ - // Check for valid CPU range, 0 indexed so >=. - if (cpu >= MAX_CPUS) - scx_bpf_error("invalid cpu %u\n", cpu); - - return (dsq_id_t){ .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; -} - -static inline dsq_id_t get_cell_l3_dsq_id(u32 cell, u32 l3) -{ - if (cell >= MAX_CELLS || l3 >= MAX_L3S) - scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); - - return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; -} -# File: scheds/rust/scx_mitosis/src/bpf/intf.h -// Copyright (c) Meta Platforms, Inc. and affiliates. - -// This software may be used and distributed according to the terms of the -// GNU General Public License version 2. -#ifndef __INTF_H -#define __INTF_H - -#ifndef __KERNEL__ -typedef unsigned long long u64; -typedef unsigned int u32; -typedef _Bool bool; -#endif - -#ifdef LSP -#define __bpf__ -#include "../../../../include/scx/ravg.bpf.h" -#else -#include -#endif - -/* ---- Work stealing config (compile-time) ------------------------------- */ -#ifndef MITOSIS_ENABLE_STEALING -#define MITOSIS_ENABLE_STEALING 1 -#endif -/* ----------------------------------------------------------------------- */ - -enum consts { - CACHELINE_SIZE = 64, - MAX_CPUS_SHIFT = 9, - MAX_CPUS = 1 << MAX_CPUS_SHIFT, - MAX_CPUS_U8 = MAX_CPUS / 8, - MAX_CELLS = 16, - USAGE_HALF_LIFE = 100000000, /* 100ms */ - - PCPU_BASE = 0x80000000, - MAX_CG_DEPTH = 256, -}; - -/* Statistics */ -enum cell_stat_idx { - CSTAT_LOCAL, - CSTAT_CPU_DSQ, - CSTAT_CELL_DSQ, - CSTAT_AFFN_VIOL, - NR_CSTATS, -}; - -/* Function invocation counters */ -enum fn_counter_idx { - COUNTER_SELECT_CPU, - COUNTER_ENQUEUE, - COUNTER_DISPATCH, - NR_COUNTERS, -}; - -struct cpu_ctx { - u64 cstats[MAX_CELLS][NR_CSTATS]; - u64 cell_cycles[MAX_CELLS]; - u32 cell; - u64 vtime_now; -}; - -struct cgrp_ctx { - u32 cell; - bool cell_owner; -}; - -#endif /* __INTF_H */ -# File: scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h -/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -/* - * This software may be used and distributed according to the terms of the - * GNU General Public License version 2. - * - * This header assists adding L3 cache awareness to scx_mitosis by defining - * maps and fns for managing CPU-to-L3 domain mappings. It provides code to - * recalculate per-L3 CPU counts within cells and implements weighted - * random L3 selection for tasks. It also tracks work-stealing - * statistics for cross-L3 task migrations. - */ -#pragma once - -#include "mitosis.bpf.h" -#include "intf.h" - -typedef u32 l3_id_t; -#define L3_INVALID ((l3_id_t)~0u) - -// Configure how aggressively we steal work. 
-// When task is detected as a steal candidate, skip it this many times -// On a web server workload, 100 reduced steal count by ~90% -#ifdef MITOSIS_ENABLE_STEALING -#define PREVENT_N_STEALS 0 -#endif - -/* Work stealing statistics map - accessible from both BPF and userspace */ -struct steal_stats_map { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, 1); -}; - -// A CPU -> L3 cache ID map -struct cpu_to_l3_map { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u32); - __uint(max_entries, MAX_CPUS); -}; - -struct l3_to_cpus_map { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, struct cpumask); - __uint(max_entries, MAX_L3S); -}; - -extern struct cpu_to_l3_map cpu_to_l3; -extern struct l3_to_cpus_map l3_to_cpus; -extern struct steal_stats_map steal_stats; - -static inline const bool l3_is_valid(u32 l3_id) -{ - if (l3_id == L3_INVALID) - return false; - - return (l3_id >= 0) && (l3_id < MAX_L3S); -} - -static inline void init_task_l3(struct task_ctx *tctx) -{ - tctx->l3 = L3_INVALID; - -#if MITOSIS_ENABLE_STEALING - tctx->pending_l3 = L3_INVALID; - tctx->steal_count = 0; - tctx->last_stolen_at = 0; - tctx->steals_prevented = 0; -#endif -} - -static inline const struct cpumask *lookup_l3_cpumask(u32 l3) -{ - struct cpumask *mask; - - if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { - scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); - return NULL; - } - - return mask; -} - -/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes */ -// TODO: use RAII and lock around updates (races with ) -static __always_inline void recalc_cell_l3_counts(u32 cell_idx) -{ - struct cell *cell = lookup_cell(cell_idx); - if (!cell) { - scx_bpf_error("recalc_cell_l3_counts: invalid cell %d", - cell_idx); - return; - } - - CPUMASK_GUARD(tmp_guard); - if (!tmp_guard.mask) { - scx_bpf_error( - "recalc_cell_l3_counts: failed to create tmp mask"); - return; - } - - u32 l3, l3s_present = 0, total_cpus = 0; - // Just so we don't hold the lock longer than necessary - u32 l3_cpu_cnt_tmp[MAX_L3S] = {0}; - - { // RCU context - RCU_READ_GUARD(); - const struct cpumask *cell_mask = - lookup_cell_cpumask(cell_idx); // RCU ptr - - if (!cell_mask) { - scx_bpf_error( - "recalc_cell_l3_counts: invalid cell mask"); - return; - } - - bpf_for(l3, 0, nr_l3) - { - const struct cpumask *l3_mask = lookup_l3_cpumask(l3); - if (!l3_mask) { - scx_bpf_error( - "recalc_cell_l3_counts: invalid l3 mask"); - return; - } - - bpf_cpumask_and(tmp_guard.mask, cell_mask, l3_mask); - - u32 cnt = bpf_cpumask_weight( (const struct cpumask *)tmp_guard.mask); - - l3_cpu_cnt_tmp[l3] = cnt; - - bpf_printk("recalc_cell_l3_counts: cnt %d", cnt); - - // These are counted across the whole cell - total_cpus += cnt; - - if (cnt) - l3s_present++; - } - } // bpf_rcu_read_unlock(); - - // WITH_CELL_LOCK(cell, cell_idx, { - for (u32 l3 = 0; l3 < nr_l3; l3++) { - cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; - } - - cell->l3_present_cnt = l3s_present; - cell->cpu_cnt = total_cpus; - // }); -} - -/** - * Weighted random selection of an L3 cache domain for a task. - * - * Uses the CPU count in each L3 domain within the cell as weights to - * probabilistically select an L3. L3 domains with more CPUs in the cell - * have higher probability of being selected. 
- * - * @cell_id: The cell ID to select an L3 from - * @return: L3 ID on success, L3_INVALID on error - */ -// TODO: Lock -static inline s32 pick_l3_for_task(u32 cell_id) -{ - struct cell *cell; - - /* Look up the cell structure */ - if (!(cell = lookup_cell(cell_id))) { - scx_bpf_error("pick_l3_for_task: invalid cell %d", cell_id); - return L3_INVALID; - } - - // No cells - if (!cell->cpu_cnt) { - scx_bpf_error( "pick_l3_for_task: cell %d has no CPUs accounted yet", cell_id); - return L3_INVALID; - } - - /* Find the L3 domain corresponding to the target value using - * weighted selection - accumulate CPU counts until we exceed target */ - - /* Generate random target value in range [0, cpu_cnt) */ - u32 target = bpf_get_prandom_u32() % cell->cpu_cnt; - u32 l3, cur = 0; - s32 ret = L3_INVALID; - - // This could be a prefix sum. Find first l3 where we exceed target - bpf_for(l3, 0, nr_l3) - { - cur += cell->l3_cpu_cnt[l3]; - if (target < cur) { - ret = (s32)l3; - break; - } - } - - if (ret == L3_INVALID) { - scx_bpf_error("pick_l3_for_task: invalid L3"); - return L3_INVALID; - } - - return ret; -} - -#ifdef MITOSIS_ENABLE_STEALING - -static inline bool try_stealing_this_task(struct task_ctx *task_ctx, - s32 local_l3, u64 candidate_dsq) -{ - // Attempt the steal, can fail beacuse it's a race. - if (!scx_bpf_dsq_move_to_local(candidate_dsq)) - return false; - - // We got the task! - task_ctx->steal_count++; - task_ctx->last_stolen_at = scx_bpf_now(); - /* Retag to thief L3 (the one for this cpu) */ - task_ctx->pending_l3 = local_l3; - task_ctx->steals_prevented = 0; - - /* Increment steal counter in map */ - u32 key = 0; - u64 *count = bpf_map_lookup_elem(&steal_stats, &key); - // NOTE: This could get expensive, but I'm not anticipating that many steals. Percpu if we care. - if (count) - __sync_fetch_and_add(count, 1); - - return true; -} - -/* Work stealing: - * Scan sibling (cell,L3) DSQs in the same cell and steal the first queued task if it can run on this cpu -*/ -static inline bool try_stealing_work(u32 cell, s32 local_l3) -{ - if (!l3_is_valid(local_l3)) - scx_bpf_error("try_stealing_work: invalid local_l3"); - - struct cell *cell_ptr = lookup_cell(cell); - if (!cell_ptr) - scx_bpf_error("try_stealing_work: invalid cell"); - - // Loop over all other L3s, looking for a queued task to steal - u32 i; - bpf_for(i, 1, nr_l3) - { - // Start with the next one to spread out the load - u32 candidate_l3 = (local_l3 + i) % nr_l3; - - // Prevents the optimizer from removing the following conditional return - // so that the verifier knows the read wil be safe - barrier_var(candidate_l3); - - if (candidate_l3 >= MAX_L3S) - continue; - - // Skip L3s that are not present in this cell - // Note: rechecking cell_ptr for verifier - // TODO: Lock? - if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) - continue; - - u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3).raw; - - struct task_struct *task = NULL; - struct task_ctx *task_ctx; - // I'm only using this for the verifier - bool found_task = false; - - // Optimization: skip if faster than constructing an iterator - // Not redundant with later checking if task found (race) - if (scx_bpf_dsq_nr_queued(candidate_dsq)) - continue; - - // Just a trick for peeking the head element - bpf_for_each(scx_dsq, task, candidate_dsq, 0) - { - task_ctx = lookup_task_ctx(task); - found_task = (task_ctx != NULL); - break; - } - - // No task? Try next L3 - if (!found_task) - continue; - - // This knob throttles stealing. 
- // TODO: make runtime configurable - if (task_ctx->steals_prevented++ < PREVENT_N_STEALS) { - continue; - } - - if (!try_stealing_this_task(task_ctx, local_l3, candidate_dsq)) - continue; - - // Success, we got a task (no guarantee it was the one we peeked though... race) - return true; - } - return false; -} -#endif -# File: scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c -/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -/* - * This software may be used and distributed according to the terms of the - * GNU General Public License version 2. - * - * scx_mitosis is a dynamic affinity scheduler. Cgroups (and their tasks) are - * assigned to Cells which are affinitized to discrete sets of CPUs. The number - * of cells is dynamic, as is cgroup to cell assignment and cell to CPU - * assignment (all are determined by userspace). - * - * Each cell has an associated DSQ which it uses for vtime scheduling of the - * cgroups belonging to the cell. - */ - -// TODO: fix debug printer. -#include "intf.h" - -#include "mitosis.bpf.h" -#include "dsq.bpf.h" -#include "l3_aware.bpf.h" - -char _license[] SEC("license") = "GPL"; - -/* - * Variables populated by userspace - */ -const volatile u32 nr_possible_cpus = 1; -const volatile bool smt_enabled = true; -const volatile unsigned char all_cpus[MAX_CPUS_U8]; - -const volatile u64 slice_ns; -const volatile u64 root_cgid = 1; - -const volatile u32 nr_l3 = 1; -/* - * CPU assignment changes aren't fully in effect until a subsequent tick() - * configuration_seq is bumped on each assignment change - * applied_configuration_seq is bumped when the effect is fully applied - */ -u32 configuration_seq; -u32 applied_configuration_seq; - -private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask; -private(root_cgrp) struct cgroup __kptr *root_cgrp; - -UEI_DEFINE(uei); - -// Cells now defined as a map so we can lock. 
-struct cell_map cells SEC(".maps"); - -/* - * Maps used for L3-aware scheduling -*/ -#if 0 -struct cell_locks_map cell_locks SEC(".maps"); -#endif -struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); -struct l3_to_cpus_map l3_to_cpus SEC(".maps"); - -/* - * Maps for statistics -*/ -struct function_counters_map function_counters SEC(".maps"); -struct steal_stats_map steal_stats SEC(".maps"); - -static inline void increment_counter(enum fn_counter_idx idx) { - u64 *counter; - u32 key = idx; - - counter = bpf_map_lookup_elem(&function_counters, &key); - if (counter) - (*counter)++; -} - -static inline struct cgroup *lookup_cgrp_ancestor(struct cgroup *cgrp, - u32 ancestor) -{ - struct cgroup *cg; - - if (!(cg = bpf_cgroup_ancestor(cgrp, ancestor))) { - scx_bpf_error("Failed to get ancestor level %d for cgid %llu", - ancestor, cgrp->kn->id); - return NULL; - } - - return cg; -} - -struct { - __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); - __uint(map_flags, BPF_F_NO_PREALLOC); - __type(key, int); - __type(value, struct cgrp_ctx); -} cgrp_ctxs SEC(".maps"); - -static inline struct cgrp_ctx *lookup_cgrp_ctx_fallible(struct cgroup *cgrp) -{ - struct cgrp_ctx *cgc; - - if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, 0))) { - return NULL; - } - - return cgc; -} - -static inline struct cgrp_ctx *lookup_cgrp_ctx(struct cgroup *cgrp) -{ - struct cgrp_ctx *cgc = lookup_cgrp_ctx_fallible(cgrp); - - if (!cgc) - scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", - cgrp->kn->id); - - return cgc; -} - -static inline struct cgroup *task_cgroup(struct task_struct *p) -{ - struct cgroup *cgrp = __COMPAT_scx_bpf_task_cgroup(p); - if (!cgrp) { - scx_bpf_error("Failed to get cgroup for task %d", p->pid); - } - return cgrp; -} - -struct { - __uint(type, BPF_MAP_TYPE_TASK_STORAGE); - __uint(map_flags, BPF_F_NO_PREALLOC); - __type(key, int); - __type(value, struct task_ctx); -} task_ctxs SEC(".maps"); - -static inline struct task_ctx *lookup_task_ctx(struct task_struct *p) -{ - struct task_ctx *tctx; - - if ((tctx = bpf_task_storage_get(&task_ctxs, p, 0, 0))) { - return tctx; - } - - scx_bpf_error("task_ctx lookup failed"); - return NULL; -} - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct cpu_ctx); - __uint(max_entries, 1); -} cpu_ctxs SEC(".maps"); - -static inline struct cpu_ctx *lookup_cpu_ctx(int cpu) -{ - struct cpu_ctx *cctx; - u32 zero = 0; - - if (cpu < 0) - cctx = bpf_map_lookup_elem(&cpu_ctxs, &zero); - else - cctx = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero, cpu); - - if (!cctx) { - scx_bpf_error("no cpu_ctx for cpu %d", cpu); - return NULL; - } - - return cctx; -} - - - -/* - * Cells are allocated concurrently in some cases (e.g. cgroup_init). - * allocate_cell and free_cell enable these allocations to be done safely - */ -static inline int allocate_cell() -{ - int cell_idx; - bpf_for(cell_idx, 0, MAX_CELLS) - { - struct cell *c; - if (!(c = lookup_cell(cell_idx))) - return -1; - - if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) { - // TODO XXX, I think we need to make this concurrent safe - // TODO, lock with recalc_cell...() - __builtin_memset(c->l3_cpu_cnt, 0, sizeof(c->l3_cpu_cnt)); - c->l3_present_cnt = 0; - // TODO zero cpu_cnt - // TODO Just zero the whole cell struct? 
- return cell_idx; - } - } - scx_bpf_error("No available cells to allocate"); - return -1; -} - -static inline int free_cell(int cell_idx) -{ - struct cell *c; - - if (cell_idx < 0 || cell_idx >= MAX_CELLS) { - scx_bpf_error("Invalid cell %d", cell_idx); - return -1; - } - - if (!(c = lookup_cell(cell_idx))) - return -1; - - WRITE_ONCE(c->in_use, 0); - return 0; -} - -/* - * Store the cpumask for each cell (owned by BPF logic). We need this in an - * explicit map to allow for these to be kptrs. - */ -struct cell_cpumask_wrapper { - struct bpf_cpumask __kptr *cpumask; - /* - * To avoid allocation on the reconfiguration path, have a second cpumask we - * can just do an xchg on. - */ - struct bpf_cpumask __kptr *tmp_cpumask; -}; - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, struct cell_cpumask_wrapper); - __uint(max_entries, MAX_CELLS); - __uint(map_flags, 0); -} cell_cpumasks SEC(".maps"); - -static inline const struct cpumask *lookup_cell_cpumask(int idx) -{ - struct cell_cpumask_wrapper *cpumaskw; - - if (!(cpumaskw = bpf_map_lookup_elem(&cell_cpumasks, &idx))) { - scx_bpf_error("no cell cpumask"); - return NULL; - } - - return (const struct cpumask *)cpumaskw->cpumask; -} - -/* - * Helper functions for bumping per-cell stats - */ -static void cstat_add(enum cell_stat_idx idx, u32 cell, struct cpu_ctx *cctx, - s64 delta) -{ - u64 *vptr; - - if ((vptr = MEMBER_VPTR(*cctx, .cstats[cell][idx]))) - (*vptr) += delta; - else - scx_bpf_error("invalid cell or stat idxs: %d, %d", idx, cell); -} - -static void cstat_inc(enum cell_stat_idx idx, u32 cell, struct cpu_ctx *cctx) -{ - cstat_add(idx, cell, cctx, 1); -} - -static inline int update_task_cpumask(struct task_struct *p, - struct task_ctx *tctx) -{ - const struct cpumask *cell_cpumask; - struct cpu_ctx *cpu_ctx; - u32 cpu; - - if (!(cell_cpumask = lookup_cell_cpumask(tctx->cell))) - return -ENOENT; - - if (!tctx->cpumask) - return -EINVAL; - - /* - * Calculate the intersection of CPUs that are both: - * 1. In this task's assigned cell (cell_cpumask) - * 2. Allowed by the task's CPU affinity (p->cpus_ptr) - * Store result in tctx->cpumask - this becomes the effective CPU set - * where this task can actually run. - */ - bpf_cpumask_and(tctx->cpumask, cell_cpumask, p->cpus_ptr); - - /* - * Check if the task can run on ALL CPUs in its assigned cell. - * If cell_cpumask is a subset of p->cpus_ptr, it means the task's - * CPU affinity doesn't restrict it within the cell - it can use - * any CPU in the cell. This affects scheduling decisions later. - * True if all the bits in cell_cpumask are set in p->cpus_ptr. - */ - tctx->all_cell_cpus_allowed = - bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); - - /* - * XXX - To be correct, we'd need to calculate the vtime - * delta in the previous dsq, scale it by the load - * fraction difference and then offset from the new - * dsq's vtime_now. For now, just do the simple thing - * and assume the offset to be zero. - * - * Revisit if high frequency dynamic cell switching - * needs to be supported. - */ - - // We want to set the task vtime to that of the cell it's joining. 
- if (tctx->all_cell_cpus_allowed) { - - const struct cpumask *l3_mask = NULL; - if (tctx->l3 != L3_INVALID) { - l3_mask = lookup_l3_cpumask((u32)tctx->l3); - /* If the L3 no longer intersects the cell's cpumask, invalidate it */ - if (!l3_mask || !bpf_cpumask_intersects(cell_cpumask, l3_mask)) - tctx->l3 = L3_INVALID; - } - - /* --- Pick a new L3 if needed --- */ - if (tctx->l3 == L3_INVALID) { - s32 new_l3 = pick_l3_for_task(tctx->cell); - if (new_l3 < 0) { - scx_bpf_error("bad L3: %d", new_l3); - return -ENODEV; - } - tctx->l3 = new_l3; - l3_mask = lookup_l3_cpumask((u32)tctx->l3); - if (!l3_mask) - return -ENOENT; - } - - /* --- Narrow the effective cpumask by the chosen L3 --- */ - /* tctx->cpumask already contains (task_affinity ∧ cell_mask) */ - if (tctx->cpumask) - bpf_cpumask_and(tctx->cpumask, (const struct cpumask *)tctx->cpumask, l3_mask); - - /* If empty after intersection, nothing can run here */ - if (tctx->cpumask && bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { - scx_bpf_error("Empty cpumask after intersection"); - return -ENODEV; - } - - /* --- Point to the correct (cell,L3) DSQ and set vtime baseline --- */ - tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); - - struct cell *cell = lookup_cell(tctx->cell); - if (!cell) - return -ENOENT; - - if (!l3_is_valid(tctx->l3)){ - scx_bpf_error("Invalid L3 %d", tctx->l3); - return -EINVAL; - } - - p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); - } else { - /* Task is CPU-restricted, use task mask */ - cpu = bpf_cpumask_any_distribute(p->cpus_ptr); - if (!(cpu_ctx = lookup_cpu_ctx(cpu))) - return -ENOENT; - tctx->dsq = get_cpu_dsq_id(cpu); - p->scx.dsq_vtime = READ_ONCE(cpu_ctx->vtime_now); - } - - return 0; -} - -/* - * Figure out the task's cell, dsq and store the corresponding cpumask in the - * task_ctx. - */ -static inline int update_task_cell(struct task_struct *p, struct task_ctx *tctx, - struct cgroup *cg) -{ - struct cgrp_ctx *cgc; - - if (!(cgc = lookup_cgrp_ctx(cg))) - return -ENOENT; - - /* - * This ordering is pretty important, we read applied_configuration_seq - * before reading everything else expecting that the updater will update - * everything and then bump applied_configuration_seq last. This ensures - * that we cannot miss an update. - */ - tctx->configuration_seq = READ_ONCE(applied_configuration_seq); - barrier(); - tctx->cell = cgc->cell; - - return update_task_cpumask(p, tctx); -} - -/* Helper function for picking an idle cpu out of a candidate set */ -static s32 pick_idle_cpu_from(struct task_struct *p, - const struct cpumask *cand_cpumask, s32 prev_cpu, - const struct cpumask *idle_smtmask) -{ - bool prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask); - s32 cpu; - - /* - * If CPU has SMT, any wholly idle CPU is likely a better pick than - * partially idle @prev_cpu. 
- */ - if (smt_enabled) { - if (prev_in_cand && - bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && - scx_bpf_test_and_clear_cpu_idle(prev_cpu)) - return prev_cpu; - - cpu = scx_bpf_pick_idle_cpu(cand_cpumask, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - return cpu; - } - - if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) - return prev_cpu; - - return scx_bpf_pick_idle_cpu(cand_cpumask, 0); -} - -/* Check if we need to update the cell/cpumask mapping */ -static __always_inline int maybe_refresh_cell(struct task_struct *p, - struct task_ctx *tctx) -{ - struct cgroup *cgrp; - int ret = 0; - if (tctx->configuration_seq != READ_ONCE(applied_configuration_seq)) { - if (!(cgrp = task_cgroup(p))) - return -1; - if (update_task_cell(p, tctx, cgrp)) - ret = -1; - bpf_cgroup_release(cgrp); - } - return ret; -} - -static __always_inline s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, - struct cpu_ctx *cctx, - struct task_ctx *tctx) -{ - struct cpumask *task_cpumask; - const struct cpumask *idle_smtmask; - s32 cpu; - - if (!(task_cpumask = (struct cpumask *)tctx->cpumask) || - !(idle_smtmask = scx_bpf_get_idle_smtmask())) { - scx_bpf_error("Failed to get task cpumask or idle smtmask"); - return -1; - } - - /* No overlap between cell cpus and task cpus, just find some idle cpu */ - if (bpf_cpumask_empty(task_cpumask)) { - cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); - cpu = pick_idle_cpu_from(p, p->cpus_ptr, prev_cpu, - idle_smtmask); - goto out; - } - - cpu = pick_idle_cpu_from(p, task_cpumask, prev_cpu, idle_smtmask); -out: - scx_bpf_put_idle_cpumask(idle_smtmask); - return cpu; -} - -/* - * select_cpu is where we update each task's cell assignment and then try to - * dispatch to an idle core in the cell if possible - */ -s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, - u64 wake_flags) -{ - s32 cpu; - struct cpu_ctx *cctx; - struct task_ctx *tctx; - - increment_counter(COUNTER_SELECT_CPU); - - if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) - return prev_cpu; - - if (maybe_refresh_cell(p, tctx) < 0) - return prev_cpu; - - /* Pinned path: only if our task really requires a per-CPU queue. */ - if (!tctx->all_cell_cpus_allowed) { - cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); - cpu = get_cpu_from_dsq(tctx->dsq); - if (scx_bpf_test_and_clear_cpu_idle(cpu)) - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); - return cpu; - } - - // Grab an idle core - if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) { - cstat_inc(CSTAT_LOCAL, tctx->cell, cctx); - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); - return cpu; - } - - if (!tctx->cpumask) { - scx_bpf_error("tctx->cpumask should never be NULL"); - return prev_cpu; - } - /* - * All else failed, send it to the prev cpu (if that's valid), otherwise any - * valid cpu. 
- */ - if (!bpf_cpumask_test_cpu(prev_cpu, cast_mask(tctx->cpumask)) && - tctx->cpumask) - cpu = bpf_cpumask_any_distribute(cast_mask(tctx->cpumask)); - else - cpu = prev_cpu; - - return cpu; -} - -void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) -{ - struct cpu_ctx *cctx; - struct task_ctx *tctx; - struct cell *cell; - s32 task_cpu = scx_bpf_task_cpu(p); - u64 vtime = p->scx.dsq_vtime; - s32 cpu = -1; - u64 basis_vtime; - - increment_counter(COUNTER_ENQUEUE); - - if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1))) - return; - - if (maybe_refresh_cell(p, tctx) < 0) - return; - - // Cpu pinned work - if (!tctx->all_cell_cpus_allowed) { - cpu = get_cpu_from_dsq(tctx->dsq); - } else if (!__COMPAT_is_enq_cpu_selected(enq_flags)) { - /* - * If we haven't selected a cpu, then we haven't looked for and kicked an - * idle CPU. Let's do the lookup now and kick at the end. - */ - if (!(cctx = lookup_cpu_ctx(-1))) - return; - cpu = pick_idle_cpu(p, task_cpu, cctx, tctx); - if (cpu == -1) - return; - if (cpu == -EBUSY) { - /* - * Verifier gets unhappy claiming two different pointer types for - * the same instruction here. This fixes it - */ - barrier_var(tctx); - if (tctx->cpumask) - cpu = bpf_cpumask_any_distribute( - (const struct cpumask *)tctx->cpumask); - } - } - - if (tctx->all_cell_cpus_allowed) { - // This is a task that can run on any cpu in the cell - - cstat_inc(CSTAT_CELL_DSQ, tctx->cell, cctx); - - /* Task can use any CPU in its cell, set basis_vtime from per-(cell, L3) vtime */ - if (!(cell = lookup_cell(tctx->cell))) - return; - - if (!l3_is_valid(tctx->l3)) { - scx_bpf_error("Invalid L3 ID for task %d in enqueue", p->pid); - return; - } - basis_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); - - } else { - // This is a task that can only run on a specific cpu - cstat_inc(CSTAT_CPU_DSQ, tctx->cell, cctx); - - /* - * cctx is the local core cpu (where enqueue is running), not the core - * the task belongs to. Fetch the right cctx - */ - if (!(cctx = lookup_cpu_ctx(cpu))) - return; - /* Task is pinned to specific CPUs, use per-CPU DSQ */ - basis_vtime = READ_ONCE(cctx->vtime_now); - } - - tctx->basis_vtime = basis_vtime; - - if (time_after(vtime, - basis_vtime + VTIME_MAX_FUTURE_MULTIPLIER * slice_ns)) { - scx_bpf_error("vtime is too far in the future for %d", p->pid); - return; - } - /* - * Limit the amount of budget that an idling task can accumulate - * to one slice. - */ - // TODO: Should this be time_before64? - if (time_before(vtime, basis_vtime - slice_ns)) - vtime = basis_vtime - slice_ns; - - scx_bpf_dsq_insert_vtime(p, tctx->dsq.raw, slice_ns, vtime, enq_flags); - - /* Kick the CPU if needed */ - if (!__COMPAT_is_enq_cpu_selected(enq_flags) && cpu >= 0) - scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); -} - -void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) -{ - struct cpu_ctx *cctx; - u32 cell; - - increment_counter(COUNTER_DISPATCH); - - if (!(cctx = lookup_cpu_ctx(-1))) - return; - - cell = READ_ONCE(cctx->cell); - - /* Start from a valid DSQ */ - dsq_id_t local_dsq = get_cpu_dsq_id(cpu); - - bool found = false; - dsq_id_t min_vtime_dsq = local_dsq; - u64 min_vtime = ~0ULL; /* U64_MAX */ - struct task_struct *p; - - // Get L3 - u32 cpu_key = (u32)cpu; - u32 *l3_ptr = bpf_map_lookup_elem(&cpu_to_l3, &cpu_key); - s32 l3 = l3_ptr ? 
(s32)*l3_ptr : L3_INVALID; - - /* Check the L3 queue */ - if (l3 != L3_INVALID) { - dsq_id_t cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); - bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) { - min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = cell_l3_dsq; - found = true; - break; - } - } - - /* Check the CPU DSQ for a lower vtime */ - bpf_for_each(scx_dsq, p, local_dsq.raw, 0) { - if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { - min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = local_dsq; - found = true; - } - break; - } - - /* - * The move_to_local can fail if we raced with some other cpu in the cell - * and now the cell is empty. We have to ensure to try the cpu_dsq or else - * we might never wakeup. - */ - - - if (found) { - // We found a task in the local or cell-L3 DSQ - - // If it was in the per cpu DSQ, there is no competation, grab it and return - if (min_vtime_dsq.raw == local_dsq.raw) { - scx_bpf_dsq_move_to_local(min_vtime_dsq.raw); - return; - } - - // If it was in the cell L3 DSQ, we are competing with other cpus in the cell-l3 - // try to move it to the local DSQ - if (scx_bpf_dsq_move_to_local(min_vtime_dsq.raw)) { - // We won the race and got the task, return - return; - } - } - -#if MITOSIS_ENABLE_STEALING - // We didn't find a task in either DSQ, or lost the race. - // Instead of going straight to idle, attempt to steal a task from another - // L3 in the cell. - - // Try stealing. If successful, this moves the task to the local runqueue - try_stealing_work(cell, l3); -#endif -} - -struct cpumask_entry { - unsigned long cpumask[CPUMASK_LONG_ENTRIES]; - u64 used; -}; - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct cpumask_entry); - __uint(max_entries, MAX_CPUMASK_ENTRIES); -} cgrp_init_percpu_cpumask SEC(".maps"); - -static inline struct cpumask_entry *allocate_cpumask_entry() -{ - int cpumask_idx; - bpf_for(cpumask_idx, 0, MAX_CPUMASK_ENTRIES) - { - struct cpumask_entry *ent = bpf_map_lookup_elem( - &cgrp_init_percpu_cpumask, &cpumask_idx); - if (!ent) { - scx_bpf_error("Failed to fetch cpumask_entry"); - return NULL; - } - if (__sync_bool_compare_and_swap(&ent->used, 0, 1)) - return ent; - } - scx_bpf_error("All cpumask entries are in use"); - return NULL; -} - -static inline void free_cpumask_entry(struct cpumask_entry *entry) -{ - WRITE_ONCE(entry->used, 0); -} - -/* For use by cleanup attribute */ -static inline void __free_cpumask_entry(struct cpumask_entry **entry) -{ - if (entry) - if (*entry) - free_cpumask_entry(*entry); -} - -#define DECLARE_CPUMASK_ENTRY(var) \ - struct cpumask_entry *var __attribute__((cleanup(__free_cpumask_entry))) - -/* Define types for cpumasks in-situ vs as a ptr in struct cpuset */ -struct cpumask___local {}; - -typedef struct cpumask___local *cpumask_var_t___ptr; - -struct cpuset___cpumask_ptr { - cpumask_var_t___ptr cpus_allowed; -}; - -typedef struct cpumask___local cpumask_var_t___arr[1]; - -struct cpuset___cpumask_arr { - cpumask_var_t___arr cpus_allowed; -}; - -/* - * Given a cgroup, get its cpumask (populated in entry), returns 0 if no - * cpumask, < 0 on error and > 0 on a populated cpumask. 
- */ -static inline int get_cgroup_cpumask(struct cgroup *cgrp, - struct cpumask_entry *entry) -{ - if (!cgrp->subsys[cpuset_cgrp_id]) - return 0; - - struct cpuset *cpuset = - container_of(cgrp->subsys[cpuset_cgrp_id], struct cpuset, css); - - if (!cpuset) - return 0; - - unsigned long runtime_cpumask_size = bpf_core_type_size(struct cpumask); - if (runtime_cpumask_size > CPUMASK_SIZE) { - scx_bpf_error( - "Definition of struct cpumask is too large. Please increase CPUMASK_LONG_ENTRIES"); - return -EINVAL; - } - - int err; - if (bpf_core_type_matches(struct cpuset___cpumask_arr)) { - struct cpuset___cpumask_arr *cpuset_typed = - (void *)bpf_core_cast(cpuset, struct cpuset); - err = bpf_core_read(&entry->cpumask, runtime_cpumask_size, - &cpuset_typed->cpus_allowed); - } else if (bpf_core_type_matches(struct cpuset___cpumask_ptr)) { - struct cpuset___cpumask_ptr *cpuset_typed = - (void *)bpf_core_cast(cpuset, struct cpuset); - err = bpf_core_read(&entry->cpumask, runtime_cpumask_size, - cpuset_typed->cpus_allowed); - } else { - scx_bpf_error( - "Definition of struct cpuset did not match any expected struct"); - return -EINVAL; - } - - if (err < 0) { - scx_bpf_error( - "bpf_core_read of cpuset->cpus_allowed failed for cgid %llu", - cgrp->kn->id); - return err; - } - - if (bpf_cpumask_empty((const struct cpumask *)&entry->cpumask)) - return 0; - - if (!all_cpumask) { - scx_bpf_error("all_cpumask should not be NULL"); - return -EINVAL; - } - - if (bpf_cpumask_subset((const struct cpumask *)all_cpumask, - (const struct cpumask *)&entry->cpumask)) - return 0; - - return 1; -} - -/* - * This array keeps track of the cgroup ancestor's cell as we iterate over the - * cgroup hierarchy. - */ -u32 level_cells[MAX_CG_DEPTH]; -int running; - -/* The guard is a stack variable. When it falls out of scope, - * we drop the running lock. 
*/ -static inline void __running_unlock(int *guard) { - (void)guard; /* unused */ - WRITE_ONCE(running, 0); -} - -/* - * On tick, we identify new cells and apply CPU assignment - */ -void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) -{ - - u32 local_configuration_seq = READ_ONCE(configuration_seq); - if (local_configuration_seq == READ_ONCE(applied_configuration_seq)) - return; - - int zero = 0; - if (!__atomic_compare_exchange_n(&running, &zero, 1, false, - __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) - return; - - int __attribute__((cleanup(__running_unlock), unused)) __running_guard; - - DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); - if (!entry) - return; - - /* Get the root cell (cell 0) and its cpumask */ - struct cell_cpumask_wrapper *root_cell_cpumaskw; - if (!(root_cell_cpumaskw = - bpf_map_lookup_elem(&cell_cpumasks, &zero))) { - scx_bpf_error("Failed to find root cell cpumask"); - return; - } - - struct bpf_cpumask *root_bpf_cpumask; - root_bpf_cpumask = - bpf_kptr_xchg(&root_cell_cpumaskw->tmp_cpumask, NULL); - if (!root_bpf_cpumask) { - scx_bpf_error("tmp_cpumask should never be null"); - return; - } - if (!root_cell_cpumaskw->cpumask) { - scx_bpf_error("root cpumasks should never be null"); - goto out; - } - - if (!all_cpumask) { - scx_bpf_error("NULL all_cpumask"); - goto out; - } - - /* - * Initialize root cell cpumask to all cpus, and then remove from it as we go - */ - bpf_cpumask_copy(root_bpf_cpumask, (const struct cpumask *)all_cpumask); - - struct cgroup_subsys_state *root_css, *pos; - struct cgroup *cur_cgrp, *root_cgrp_ref; - - if (!root_cgrp) { - scx_bpf_error("root_cgrp should not be null"); - goto out; - } - - struct cgrp_ctx *root_cgrp_ctx; - if (!(root_cgrp_ctx = lookup_cgrp_ctx(root_cgrp))) - goto out; - - if (!root_cgrp) { - scx_bpf_error("root_cgrp should not be null"); - goto out; - } - - if (!(root_cgrp_ref = bpf_cgroup_acquire(root_cgrp))) { - scx_bpf_error("Failed to acquire reference to root_cgrp"); - goto out; - } - root_css = &root_cgrp_ref->self; - - bpf_rcu_read_lock(); - /* - * Iterate over all cgroups, check if any have a cpumask and populate them - * as a separate cell. - */ - bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) { - cur_cgrp = pos->cgroup; - - /* - * We can iterate over dying cgroups, in which case this lookup will - * fail. These cgroups can't have tasks in them so just continue. - */ - struct cgrp_ctx *cgrp_ctx; - if (!(cgrp_ctx = lookup_cgrp_ctx_fallible(cur_cgrp))) - continue; - - int rc = get_cgroup_cpumask(cur_cgrp, entry); - if (!rc) { - /* - * TODO: If this was a cell owner that just had its cpuset removed, - * it should free the cell. Doing so would require draining - * in-flight tasks scheduled to the dsq. - */ - /* No cpuset, assign to parent cell and continue */ - if (cur_cgrp->kn->id != root_cgid) { - u32 level = cur_cgrp->level; - if (level <= 0 || level >= MAX_CG_DEPTH) { - scx_bpf_error( - "Cgroup hierarchy is too deep: %d", - level); - goto out_rcu_unlock; - } - /* - * This is a janky way of getting the parent cell, ideally we'd - * lookup the parent cgrp_ctx and get it that way, but some - * cgroup lookups don't work here because they are (erroneously) - * only operating on the cgroup namespace of current. Given this - * is a tick() it could be anything. See - * https://lore.kernel.org/bpf/20250811175045.1055202-1-memxor@gmail.com/ - * for details. - * - * Instead, we just track the parent cells as we walk the cgroup - * hierarchy in a separate array. 
Because the iteration is - * pre-order traversal, we're guaranteed to have the current - * cgroup's ancestor's cells in level_cells. - */ - u32 parent_cell = level_cells[level - 1]; - WRITE_ONCE(cgrp_ctx->cell, parent_cell); - level_cells[level] = parent_cell; - } - continue; - } else if (rc < 0) - goto out_rcu_unlock; - - /* - * cgroup has a cpumask, allocate a new cell if needed, and assign cpus - */ - int cell_idx = READ_ONCE(cgrp_ctx->cell); - if (!cgrp_ctx->cell_owner) { - cell_idx = allocate_cell(); - if (cell_idx < 0) - goto out_rcu_unlock; - cgrp_ctx->cell_owner = true; - } - - struct cell_cpumask_wrapper *cell_cpumaskw; - if (!(cell_cpumaskw = - bpf_map_lookup_elem(&cell_cpumasks, &cell_idx))) { - scx_bpf_error("Failed to find cell cpumask: %d", - cell_idx); - goto out_rcu_unlock; - } - - struct bpf_cpumask *bpf_cpumask; - bpf_cpumask = bpf_kptr_xchg(&cell_cpumaskw->tmp_cpumask, NULL); - if (!bpf_cpumask) { - scx_bpf_error("tmp_cpumask should never be null"); - goto out_rcu_unlock; - } - bpf_cpumask_copy(bpf_cpumask, - (const struct cpumask *)&entry->cpumask); - int cpu_idx; - bpf_for(cpu_idx, 0, nr_possible_cpus) - { - if (bpf_cpumask_test_cpu( - cpu_idx, - (const struct cpumask *)&entry->cpumask)) { - struct cpu_ctx *cpu_ctx; - if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) { - bpf_cpumask_release(bpf_cpumask); - goto out_rcu_unlock; - } - cpu_ctx->cell = cell_idx; - bpf_cpumask_clear_cpu(cpu_idx, - root_bpf_cpumask); - } - } - bpf_cpumask = - bpf_kptr_xchg(&cell_cpumaskw->cpumask, bpf_cpumask); - if (!bpf_cpumask) { - scx_bpf_error("cpumask should never be null"); - goto out_rcu_unlock; - } - - bpf_cpumask = - bpf_kptr_xchg(&cell_cpumaskw->tmp_cpumask, bpf_cpumask); - if (bpf_cpumask) { - scx_bpf_error("tmp_cpumask should be null"); - bpf_cpumask_release(bpf_cpumask); - goto out_rcu_unlock; - } - - barrier(); - WRITE_ONCE(cgrp_ctx->cell, cell_idx); - u32 level = cur_cgrp->level; - if (level <= 0 || level >= MAX_CG_DEPTH) { - scx_bpf_error("Cgroup hierarchy is too deep: %d", - level); - goto out_rcu_unlock; - } - level_cells[level] = cell_idx; - } - bpf_rcu_read_unlock(); - - /* - * assign root cell cpus that are left over - */ - int cpu_idx; - bpf_for(cpu_idx, 0, nr_possible_cpus) - { - if (bpf_cpumask_test_cpu( cpu_idx, (const struct cpumask *)root_bpf_cpumask)) { - struct cpu_ctx *cpu_ctx; - if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) - goto out_root_cgrp; - cpu_ctx->cell = 0; - } - } - - root_bpf_cpumask = - bpf_kptr_xchg(&root_cell_cpumaskw->cpumask, root_bpf_cpumask); - if (!root_bpf_cpumask) { - scx_bpf_error("root cpumask should never be null"); - bpf_cgroup_release(root_cgrp_ref); - return; - } - - root_bpf_cpumask = bpf_kptr_xchg(&root_cell_cpumaskw->tmp_cpumask, - root_bpf_cpumask); - if (root_bpf_cpumask) { - scx_bpf_error("root tmp_cpumask should be null"); - goto out_root_cgrp; - } - - int cell_idx; - /* Recalculate L3 counts for all active cells after CPU assignment changes */ - bpf_for(cell_idx, 1, MAX_CELLS) { - struct cell *cell; - if (!(cell = lookup_cell(cell_idx))) { - scx_bpf_error("Lookup for cell %d failed in tick()", cell_idx); - goto out_root_cgrp; - } - - if (!cell->in_use) - continue; - - /* Recalculate L3 counts for each active cell */ - recalc_cell_l3_counts(cell_idx); - } - - /* Recalculate root cell's L3 counts after cpumask update */ - recalc_cell_l3_counts(ROOT_CELL_ID); - - barrier(); - WRITE_ONCE(applied_configuration_seq, local_configuration_seq); - - bpf_cgroup_release(root_cgrp_ref); - return; - -out_rcu_unlock: - bpf_rcu_read_unlock(); 
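-	/*
-	 * Error unwind order: the RCU read lock is dropped above, the root
-	 * cgroup reference is released at out_root_cgrp, and the root
-	 * tmp_cpumask is released at out if we still hold it.
-	 */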
-out_root_cgrp: - bpf_cgroup_release(root_cgrp_ref); -out: - if (root_bpf_cpumask) - bpf_cpumask_release(root_bpf_cpumask); -} - -void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) -{ - struct cpu_ctx *cctx; - struct task_ctx *tctx; - struct cell *cell; - - if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1)) || - !(cell = lookup_cell(cctx->cell))) - return; - - /* - * If this task was stolen across L3s, retag to thief L3 and recompute - * effective cpumask+DSQ. Preserve vtime to keep fairness. - */ -#if MITOSIS_ENABLE_STEALING - if (l3_is_valid(tctx->pending_l3)) { - u64 save_v = p->scx.dsq_vtime; - tctx->l3 = tctx->pending_l3; - tctx->pending_l3 = L3_INVALID; - update_task_cpumask(p, tctx); - p->scx.dsq_vtime = save_v; - } -#endif - - /* Validate task's DSQ before it starts running */ - if (tctx->dsq.raw == DSQ_INVALID) { - if (tctx->all_cell_cpus_allowed) { - scx_bpf_error( - "Task %d has invalid DSQ 0 in running callback (CELL-SCHEDULABLE task, can run on any CPU in cell %d)", - p->pid, tctx->cell); - } else { - scx_bpf_error( - "Task %d has invalid DSQ 0 in running callback (CORE-PINNED task, restricted to specific CPUs)", - p->pid); - } - return; - } - - /* - * Update per-(cell, L3) vtime for cell-schedulable tasks - */ - if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { - if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), p->scx.dsq_vtime)) - WRITE_ONCE(cell->l3_vtime_now[tctx->l3], p->scx.dsq_vtime); - } - - /* - * Update CPU vtime for CPU-pinned tasks - */ - if (time_before(READ_ONCE(cctx->vtime_now), p->scx.dsq_vtime)) - WRITE_ONCE(cctx->vtime_now, p->scx.dsq_vtime); - - tctx->started_running_at = scx_bpf_now(); -} - -void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) -{ - struct cpu_ctx *cctx; - struct task_ctx *tctx; - struct cell *cell; - u64 now, used; - u32 cidx; - - if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) - return; - - cidx = tctx->cell; - if (!(cell = lookup_cell(cidx))) - return; - - now = scx_bpf_now(); - used = now - tctx->started_running_at; - tctx->started_running_at = now; - /* scale the execution time by the inverse of the weight and charge */ - p->scx.dsq_vtime += used * DEFAULT_WEIGHT_MULTIPLIER / p->scx.weight; - - if (cidx != 0 || tctx->all_cell_cpus_allowed) { - u64 *cell_cycles = MEMBER_VPTR(cctx->cell_cycles, [cidx]); - if (!cell_cycles) { - scx_bpf_error("Cell index is too large: %d", cidx); - return; - } - *cell_cycles += used; - - /* - * For cell-schedulable tasks, also accumulate vtime into - * per-cell per-L3 queues - */ - if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { - /* Accumulate weighted execution time into per-(cell, L3) vtime */ - cell->l3_vtime_now[tctx->l3] += - used * DEFAULT_WEIGHT_MULTIPLIER / - p->scx.weight; - } - } -} - -SEC("fentry/cpuset_write_resmask") -int BPF_PROG(fentry_cpuset_write_resmask, struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, ssize_t retval) -{ - /* - * On a write to cpuset.cpus, we'll need to configure new cells, bump - * configuration_seq so tick() does that. 
- */ - __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); - return 0; -} - -s32 BPF_STRUCT_OPS(mitosis_cgroup_init, struct cgroup *cgrp, - struct scx_cgroup_init_args *args) -{ - struct cgrp_ctx *cgc; - if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE))) { - scx_bpf_error("cgrp_ctx creation failed for cgid %llu", - cgrp->kn->id); - return -ENOENT; - } - - // Special case for root cell - if (cgrp->kn->id == root_cgid) { - WRITE_ONCE(cgc->cell, ROOT_CELL_ID); - return 0; - } - - DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); - if (!entry) - return -EINVAL; - int rc = get_cgroup_cpumask(cgrp, entry); - if (rc < 0) - return rc; - else if (rc > 0) { - /* - * This cgroup has a cpuset, bump configuration_seq so tick() - * configures it. - */ - __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); - } - - /* Initialize to parent's cell */ - struct cgroup *parent_cg; - if (!(parent_cg = lookup_cgrp_ancestor(cgrp, cgrp->level - 1))) - return -ENOENT; - - struct cgrp_ctx *parent_cgc; - if (!(parent_cgc = lookup_cgrp_ctx(parent_cg))) { - bpf_cgroup_release(parent_cg); - return -ENOENT; - } - - bpf_cgroup_release(parent_cg); - cgc->cell = parent_cgc->cell; - return 0; -} - -s32 BPF_STRUCT_OPS(mitosis_cgroup_exit, struct cgroup *cgrp) -{ - struct cgrp_ctx *cgc; - if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE))) { - scx_bpf_error("cgrp_ctx creation failed for cgid %llu", - cgrp->kn->id); - return -ENOENT; - } - - if (cgc->cell_owner) { - int ret; - if ((ret = free_cell(cgc->cell))) - return ret; - /* - * Need to make sure the cpus of this cell are freed back to the root - * cell and the root cell cpumask can be expanded. Bump - * configuration_seq so tick() does that. - */ - __atomic_add_fetch(&configuration_seq, 1, __ATOMIC_RELEASE); - } - - return 0; -} - -void BPF_STRUCT_OPS(mitosis_cgroup_move, struct task_struct *p, - struct cgroup *from, struct cgroup *to) -{ - struct task_ctx *tctx; - - if (!(tctx = lookup_task_ctx(p))) - return; - - update_task_cell(p, tctx, to); -} - -void BPF_STRUCT_OPS(mitosis_set_cpumask, struct task_struct *p, - const struct cpumask *cpumask) -{ - struct task_ctx *tctx; - - if (!(tctx = lookup_task_ctx(p))) - return; - - if (!all_cpumask) { - scx_bpf_error("NULL all_cpumask"); - return; - } - - update_task_cpumask(p, tctx); -} - -s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, - struct scx_init_task_args *args) -{ - struct task_ctx *tctx; - struct bpf_cpumask *cpumask; - int ret; - - tctx = bpf_task_storage_get(&task_ctxs, p, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); - if (!tctx) { - scx_bpf_error("task_ctx allocation failure"); - return -ENOMEM; - } - - cpumask = bpf_cpumask_create(); - if (!cpumask) - return -ENOMEM; - - cpumask = bpf_kptr_xchg(&tctx->cpumask, cpumask); - if (cpumask) { - /* Should never happen as we just inserted it above. 
*/ - bpf_cpumask_release(cpumask); - scx_bpf_error("tctx cpumask is unexpectedly populated on init"); - return -EINVAL; - } - - if (!all_cpumask) { - scx_bpf_error("missing all_cpumask"); - return -EINVAL; - } - - /* Initialize L3 to invalid before cell assignment */ - init_task_l3(tctx); - - // TODO clean this up - if ((ret = update_task_cell(p, tctx, args->cgroup))) { - return ret; - } - - return 0; -} - -__hidden void dump_cpumask_word(s32 word, const struct cpumask *cpumask) -{ - u32 u, v = 0; - - bpf_for(u, 0, BITS_PER_U32) - { - s32 cpu = BITS_PER_U32 * word + u; - if (cpu < nr_possible_cpus && - bpf_cpumask_test_cpu(cpu, cpumask)) - v |= 1 << u; - } - scx_bpf_dump("%08x", v); -} - -static void dump_cpumask(const struct cpumask *cpumask) -{ - u32 word, nr_words = (nr_possible_cpus + 31) / 32; - - bpf_for(word, 0, nr_words) - { - if (word) - scx_bpf_dump(","); - dump_cpumask_word(nr_words - word - 1, cpumask); - } -} - -static void dump_cell_cpumask(int id) -{ - const struct cpumask *cell_cpumask; - - if (!(cell_cpumask = lookup_cell_cpumask(id))) - return; - - dump_cpumask(cell_cpumask); -} - -/* Print cell state for debugging */ -static __always_inline void dump_cell_state(u32 cell_idx) -{ - struct cell *cell = lookup_cell(cell_idx); - if (!cell) { - scx_bpf_dump("Cell %d: NOT FOUND", cell_idx); - return; - } - - scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", - cell_idx, cell->in_use, cell->cpu_cnt, cell->l3_present_cnt); - - u32 l3; - // TODO Print vtimes for L3s - // TODO lock - bpf_for(l3, 0, nr_l3) { - if (cell->l3_cpu_cnt[l3] > 0) { - scx_bpf_dump(" L3[%d]: %d CPUs", l3, cell->l3_cpu_cnt[l3]); - } - } -} - -// TODO: FIX THIS -static __always_inline void dump_l3_state(){ -} - -void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) -{ - dsq_id_t dsq_id; - int i; - struct cell *cell; - struct cpu_ctx *cpu_ctx; - - scx_bpf_dump_header(); - - bpf_for(i, 0, MAX_CELLS) - { - if (!(cell = lookup_cell(i))) - return; - - if (!cell->in_use) - continue; - - scx_bpf_dump("CELL[%d] CPUS=", i); - dump_cell_cpumask(i); - scx_bpf_dump("\n"); - dump_cell_state(i); - } - - bpf_for(i, 0, nr_possible_cpus) - { - if (!(cpu_ctx = lookup_cpu_ctx(i))) - return; - - dsq_id = get_cpu_dsq_id(i); - scx_bpf_dump("CPU[%d] cell=%d vtime=%llu nr_queued=%d\n", i, - cpu_ctx->cell, READ_ONCE(cpu_ctx->vtime_now), - scx_bpf_dsq_nr_queued(dsq_id.raw)); - } - - dump_l3_state(); - -} - -void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, - struct task_struct *p) -{ - struct task_ctx *tctx; - - if (!(tctx = lookup_task_ctx(p))) - return; - - scx_bpf_dump( - "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%llu all_cell_cpus_allowed=%d\n", - p->pid, p->scx.dsq_vtime, tctx->basis_vtime, tctx->cell, - tctx->dsq.raw, tctx->all_cell_cpus_allowed); - scx_bpf_dump("Task[%d] CPUS=", p->pid); - dump_cpumask(p->cpus_ptr); - scx_bpf_dump("\n"); -} - -s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) -{ - struct bpf_cpumask *cpumask; - u32 i; - s32 ret; - - struct cgroup *rootcg; - if (!(rootcg = bpf_cgroup_from_id(root_cgid))) - return -ENOENT; - - rootcg = bpf_kptr_xchg(&root_cgrp, rootcg); - if (rootcg) - bpf_cgroup_release(rootcg); - - /* setup all_cpumask */ - cpumask = bpf_cpumask_create(); - if (!cpumask) - return -ENOMEM; - - bpf_for(i, 0, nr_possible_cpus) - { - const volatile u8 *u8_ptr; - - if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { - if (*u8_ptr & (1 << (i % 8))) { - bpf_cpumask_set_cpu(i, cpumask); - ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, ANY_NUMA); - if (ret < 0) { - 
bpf_cpumask_release(cpumask); - return ret; - } - } - } else { - return -EINVAL; - } - } - - - cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); - if (cpumask) - bpf_cpumask_release(cpumask); - - /* setup cell cpumasks */ - bpf_for(i, 0, MAX_CELLS) - { - struct cell_cpumask_wrapper *cpumaskw; - if (!(cpumaskw = bpf_map_lookup_elem(&cell_cpumasks, &i))) - return -ENOENT; - - cpumask = bpf_cpumask_create(); - if (!cpumask) - return -ENOMEM; - - /* - * Start with all full cpumask for all cells. They'll get setup in - * cgroup_init - */ - bpf_cpumask_setall(cpumask); - - cpumask = bpf_kptr_xchg(&cpumaskw->cpumask, cpumask); - if (cpumask) { - /* Should be impossible, we just initialized the cell cpumask */ - bpf_cpumask_release(cpumask); - return -EINVAL; - } - - cpumask = bpf_cpumask_create(); - if (!cpumask) - return -ENOMEM; - cpumask = bpf_kptr_xchg(&cpumaskw->tmp_cpumask, cpumask); - if (cpumask) { - /* Should be impossible, we just initialized the cell tmp_cpumask */ - bpf_cpumask_release(cpumask); - return -EINVAL; - } - } - - // cells[0].in_use = true; - lookup_cell(0)->in_use = true; - - /* Configure root cell (cell 0) topology at init time using nr_l3 and l3_to_cpu masks */ - recalc_cell_l3_counts(ROOT_CELL_ID); - - /* Create (cell,L3) DSQs for all pairs. Userspace will populate maps. */ - // This is a crazy over-estimate - bpf_for(i, 0, MAX_CELLS) - { - u32 l3; - bpf_for(l3, 0, nr_l3) - { - ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, ANY_NUMA); - if (ret < 0) - scx_bpf_error( "Failed to create DSQ for cell %d, L3 %d: err %d", i, l3, ret); - } - } - - return 0; -} - -void BPF_STRUCT_OPS(mitosis_exit, struct scx_exit_info *ei) -{ - // int i; - // bpf_for(i, 0, MAX_CELLS); { - // dump_cell_state((u32)i); - // } - - UEI_RECORD(uei, ei); -} - -SEC(".struct_ops.link") -struct sched_ext_ops mitosis = { - .select_cpu = (void *)mitosis_select_cpu, - .enqueue = (void *)mitosis_enqueue, - .dispatch = (void *)mitosis_dispatch, - .tick = (void *)mitosis_tick, - .running = (void *)mitosis_running, - .stopping = (void *)mitosis_stopping, - .set_cpumask = (void *)mitosis_set_cpumask, - .init_task = (void *)mitosis_init_task, - .cgroup_init = (void *)mitosis_cgroup_init, - .cgroup_exit = (void *)mitosis_cgroup_exit, - .cgroup_move = (void *)mitosis_cgroup_move, - .dump = (void *)mitosis_dump, - .dump_task = (void *)mitosis_dump_task, - .init = (void *)mitosis_init, - .exit = (void *)mitosis_exit, - .name = "mitosis", -}; -# File: scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h -/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -/* - * This software may be used and distributed according to the terms of the - * GNU General Public License version 2. - * - * This defines the core data structures, types, and constants - * for the scx_mitosis scheduler, primarily containing `struct cell` - * and `struct task_ctx`. - */ - -#pragma once - -#ifdef LSP -#define __bpf__ -#include "../../../../include/scx/common.bpf.h" -#include "../../../../include/scx/ravg_impl.bpf.h" -#else -#include -#include -#endif - -#include "intf.h" - -#define MAX_L3S 16 - -#include "dsq.bpf.h" - -/* - * A couple of tricky things about checking a cgroup's cpumask: - * - * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get - * this right now is to copy the cpumask to a map entry. Given that cgroup init - * could be re-entrant we have a few per-cpu entries in a map to make this - * doable. 
- * - * Second, cpumask can sometimes be stored as an array in-situ or as a pointer - * and with different lengths. Some bpf_core_type_matches finagling can make - * this all work. - */ -#define MAX_CPUMASK_ENTRIES (4) - -/* - * We don't know how big struct cpumask is at compile time, so just allocate a - * large space and check that it is big enough at runtime - * TODO: This should be deduplicated with the rust code and put in intf.h - */ -#define CPUMASK_LONG_ENTRIES (128) -#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) - -extern const volatile u32 nr_l3; - -extern struct cell_map cells; - - -enum mitosis_constants { - - /* Root cell index */ - ROOT_CELL_ID = 0, - - /* Invalid/unset L3 value */ - // INVALID_L3_ID = -1, - - /* Default weight divisor for vtime calculation */ - DEFAULT_WEIGHT_MULTIPLIER = 100, - - /* Vtime validation multiplier (slice_ns * 8192) */ - VTIME_MAX_FUTURE_MULTIPLIER = 8192, - - /* Bits per u32 for cpumask operations */ - BITS_PER_U32 = 32, - - /* No NUMA constraint for DSQ creation */ - ANY_NUMA = -1, -}; - -struct cell { - struct bpf_spin_lock lock; - - // Whether or not the cell is used or not - u32 in_use; - // Number of CPUs in this cell - u32 cpu_cnt; - // per-L3 vtimes within this cell - u64 l3_vtime_now[MAX_L3S]; - // Number of CPUs from each L3 assigned to this cell - u32 l3_cpu_cnt[MAX_L3S]; - // Number of L3s with at least one CPU in this cell - u32 l3_present_cnt; - - // TODO XXX remove this, only here temporarily to make the code compile - // current vtime of the cell - u64 vtime_now; -}; - -// #if 0 -/* Wrap the spin lock in a struct for verifier */ -// struct cell_lock_wrapper { -// struct bpf_spin_lock lock; -// }; - -// struct cell_locks_map { -// __uint(type, BPF_MAP_TYPE_ARRAY); -// __type(key, u32); -// __type(value, struct cell_lock_wrapper); -// __uint(max_entries, MAX_CELLS); -// }; - -#define WITH_CELL_LOCK(cell_ptr, cell_idx, block) \ - do { \ - struct bpf_spin_lock *lock = get_cell_lock(cell_idx); \ - if (!lock) { \ - scx_bpf_error("Failed to get lock for cell %d", \ - cell_idx); \ - break; \ - } \ - bpf_spin_lock(lock); \ - block bpf_spin_unlock(lock); \ - } while (0) - -static inline struct cell *lookup_cell(int idx) -{ - struct cell *cell; - - // cell = MEMBER_VPTR(cells, [idx]); - cell = bpf_map_lookup_elem(&cells, &idx); - - - if (!cell) { - scx_bpf_error("Invalid cell %d", idx); - return NULL; - } - return cell; -} - -static inline struct bpf_spin_lock *get_cell_lock(u32 cell_idx) -{ - if (cell_idx >= MAX_CELLS) { - scx_bpf_error("Invalid cell index %d", cell_idx); - return NULL; - } - - struct cell *cell = lookup_cell(cell_idx); - if (!cell) { - scx_bpf_error("Cell %d not found", cell_idx); - return NULL; - } - return &cell->lock; -} -// #endif - -/* - * task_ctx is the per-task information kept by scx_mitosis - */ -struct task_ctx { - /* cpumask is the set of valid cpus this task can schedule on */ - /* (tasks cpumask anded with its cell cpumask) */ - struct bpf_cpumask __kptr *cpumask; - /* started_running_at for recording runtime */ - u64 started_running_at; - u64 basis_vtime; - /* For the sake of monitoring, each task is owned by a cell */ - u32 cell; - /* For the sake of scheduling, a task is exclusively owned by either a cell - * or a cpu */ - dsq_id_t dsq; - /* latest configuration that was applied for this task */ - /* (to know if it has to be re-applied) */ - u32 configuration_seq; - /* Is this task allowed on all cores of its cell? 
*/ - bool all_cell_cpus_allowed; - // Which L3 this task is assigned to - s32 l3; - -#if MITOSIS_ENABLE_STEALING - /* When a task is stolen, dispatch() marks the destination L3 here. - * running() applies the retag and recomputes cpumask (vtime preserved). - */ - s32 pending_l3; - u32 steal_count; /* how many times this task has been stolen */ - u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ - u32 steals_prevented; /* how many times this task has been prevented from being stolen */ -#endif -}; - -// These could go in mitosis.bpf.h, but we'll cross that bridge when we get -static inline const struct cpumask *lookup_cell_cpumask(int idx); - -static inline struct task_ctx *lookup_task_ctx(struct task_struct *p); - -/* MAP TYPES */ -struct function_counters_map { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, NR_COUNTERS); -}; - -struct cell_map { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, struct cell); - __uint(max_entries, MAX_CELLS); -}; - -struct rcu_read_guard { - bool active; -}; - -static inline struct rcu_read_guard rcu_read_lock_guard(void) -{ - bpf_rcu_read_lock(); - return (struct rcu_read_guard){ .active = true }; -} - -static inline void rcu_read_guard_release(struct rcu_read_guard *guard) -{ - if (guard->active) { - bpf_rcu_read_unlock(); - guard->active = false; - } -} -#define RCU_READ_GUARD() \ - struct rcu_read_guard __rcu_guard \ - __attribute__((__cleanup__(rcu_read_guard_release))) = \ - rcu_read_lock_guard() - -struct cpumask_guard { - struct bpf_cpumask *mask; -}; - -static inline struct cpumask_guard cpumask_create_guard(void) -{ - struct bpf_cpumask *mask = bpf_cpumask_create(); - return (struct cpumask_guard){ .mask = mask }; -} - -static inline void cpumask_guard_release(struct cpumask_guard *guard) -{ - if (guard->mask) { - bpf_cpumask_release(guard->mask); - guard->mask = NULL; - } -} - -#define CPUMASK_GUARD(var_name) \ - struct cpumask_guard var_name \ - __attribute__((__cleanup__(cpumask_guard_release))) = \ - cpumask_create_guard() From 0b126e969bb266799307904d074f4b35a55a8ba2 Mon Sep 17 00:00:00 2001 From: tommy-u Date: Fri, 10 Oct 2025 14:33:24 -0700 Subject: [PATCH 12/12] Fix work stealing bug --- scheds/rust/scx_mitosis/src/bpf/intf.h | 2 -- scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h | 6 ++---- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 5 ----- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index b1612430c6..b1fcbf7941 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -20,9 +20,7 @@ typedef _Bool bool; #endif /* ---- Work stealing config (compile-time) ------------------------------- */ -#ifndef MITOSIS_ENABLE_STEALING #define MITOSIS_ENABLE_STEALING 1 -#endif /* ----------------------------------------------------------------------- */ enum consts { diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h index 2e5281984b..492b2723c7 100644 --- a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -20,9 +20,7 @@ typedef u32 l3_id_t; // Configure how aggressively we steal work. 
// When task is detected as a steal candidate, skip it this many times // On a web server workload, 100 reduced steal count by ~90% -#ifdef MITOSIS_ENABLE_STEALING #define PREVENT_N_STEALS 0 -#endif /* Work stealing statistics map - accessible from both BPF and userspace */ struct steal_stats_map { @@ -213,7 +211,7 @@ static inline s32 pick_l3_for_task(u32 cell_id) return ret; } -#ifdef MITOSIS_ENABLE_STEALING +#if MITOSIS_ENABLE_STEALING static inline bool try_stealing_this_task(struct task_ctx *task_ctx, s32 local_l3, u64 candidate_dsq) @@ -280,7 +278,7 @@ static inline bool try_stealing_work(u32 cell, s32 local_l3) // Optimization: skip if faster than constructing an iterator // Not redundant with later checking if task found (race) - if (scx_bpf_dsq_nr_queued(candidate_dsq)) + if (!scx_bpf_dsq_nr_queued(candidate_dsq)) continue; // Just a trick for peeking the head element diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 44cfee2f3d..3e1eac406e 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -1625,11 +1625,6 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) void BPF_STRUCT_OPS(mitosis_exit, struct scx_exit_info *ei) { - // int i; - // bpf_for(i, 0, MAX_CELLS); { - // dump_cell_state((u32)i); - // } - UEI_RECORD(uei, ei); }
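
Context for the work-stealing fix above: the early check in try_stealing_work() was inverted, so candidate (cell, L3) DSQs that had tasks queued were skipped while empty ones were scanned, and stealing effectively never found work. The hunk flips the test so empty candidates are skipped instead. The switch from #ifdef to #if MITOSIS_ENABLE_STEALING likewise matches intf.h now defining the flag unconditionally to 1, so the flag's value rather than its mere definedness gates the code. Below is a minimal sketch of the corrected skip logic only, assuming the candidate-L3 loop shown in the hunk; candidate_has_work() is a hypothetical helper used purely for illustration, and only the scx_bpf_dsq_nr_queued() test mirrors the actual code.

/* Hypothetical helper illustrating the corrected check (not in the tree). */
static inline bool candidate_has_work(u64 candidate_dsq)
{
	/* Skip empty DSQs cheaply; only queues with tasks are worth iterating.
	 * The pre-fix code tested the opposite condition, skipping exactly the
	 * DSQs that had something to steal. */
	return scx_bpf_dsq_nr_queued(candidate_dsq) > 0;
}

/* Usage inside a loop over candidate L3s (sketch):
 *
 *	if (!candidate_has_work(candidate_dsq))
 *		continue;
 *	// otherwise peek the head task and attempt the steal
 */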