diff --git a/scheds/rust/scx_mitosis/build.rs b/scheds/rust/scx_mitosis/build.rs index f617cea07d..a5854f718c 100644 --- a/scheds/rust/scx_mitosis/build.rs +++ b/scheds/rust/scx_mitosis/build.rs @@ -6,7 +6,7 @@ fn main() { scx_cargo::BpfBuilder::new() .unwrap() - .enable_intf("src/bpf/intf.h", "bpf_intf.rs") + .enable_intf("src/bpf/intf_rust.h", "bpf_intf.rs") .enable_skel("src/bpf/mitosis.bpf.c", "bpf") .build() .unwrap(); diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h new file mode 100644 index 0000000000..fc50f17fba --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -0,0 +1,201 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header defines the 64-bit dispatch queue (DSQ) ID encoding + * scheme for scx_mitosis, using type fields to distinguish between + * per-CPU and cell+L3 domain queues. It includes helper functions to + * construct, validate, and parse these DSQ IDs for queue management. + */ +#pragma once + +#include "intf.h" +#include "mitosis.bpf.h" + +/* + * ================================ + * BPF DSQ ID Layout (64 bits wide) + * ================================ + * + * Top-level format: + * [63] [62..0] + * [ B] [ ID ] + * + * If B == 1 it is a Built-in DSQ + * ------------------------- + * [63] [62] [61 .. 32] [31..0] + * [ 1] [ L] [ R ] [ V ] + * + * - L (bit 62): LOCAL_ON flag + * If L == 1 -> V = CPU number + * - R (30 bits): reserved / unused + * - V (32 bits): value (e.g., CPU#) + * + * If B == 0 -> User-defined DSQ + * ----------------------------- + * Only the low 32 bits are used. + * + * [63 .. 32] [31..0] + * [ 0][ unused ] [ VAL ] + * + * Mitosis uses VAL as follows: + * + * [31..28] [27..0] + * [QTYPE ] [DATA ] + * + * QTYPE encodes the queue type: + * + * QTYPE = 0x1 -> Per-CPU Q + * [31..28] [27 .. .. 0] + * [ 0001 ] [ CPU# ] + * [Q-TYPE:1] + * + * QTYPE = 0x2 -> Cell+L3 Q + * [31..28] [27 .. 16] [15 .. 0] + * [ 0010 ] [ CELL# ] [ L3ID ] + * [Q-TYPE:2] + * + */ +/* + * The use of these bitfields depends on compiler defined byte AND bit ordering. + * Make sure we're only building with Clang/LLVM and that we're little-endian. + */ +#ifndef __clang__ +#error "This code must be compiled with Clang/LLVM (eBPF: clang -target bpf)." +#endif + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +#error "dsq64 bitfield layout assumes little-endian (bpfel)." 
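+/*
+ * (Why this matters: on a big-endian target the bitfield allocation order
+ * flips, so the example encodings from the layout above would no longer
+ * hold; e.g. on little-endian, get_cpu_dsq_id(5) yields raw == 0x10000005
+ * with QTYPE = 0x1 in bits 31..28 and the CPU number in bits 27..0, and
+ * get_cell_l3_dsq_id(2, 3) yields raw == 0x20020003.)
+ */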
+#endif + +/* ---- Bitfield widths (bits) ---- */ +#define CPU_B 28 +#define L3_B 16 +#define CELL_B 12 +#define TYPE_B 4 +#define DATA_B 28 +#define RSVD_B 32 + +/* Sum checks (in bits) */ +_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits"); +_Static_assert(L3_B + CELL_B + TYPE_B == 32, + "CELL+L3 layout low half must be 32 bits"); +_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits"); + +typedef union { + u64 raw; + + /* Per-CPU user DSQ */ + struct { + u64 cpu : CPU_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } cpu_dsq; + + /* Cell+L3 user DSQ */ + struct { + u64 l3 : L3_B; + u64 cell : CELL_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } cell_l3_dsq; + + /* Generic user view */ + struct { + u64 data : DATA_B; + u64 type : TYPE_B; + u64 rsvd : RSVD_B; + } user_dsq; + + /* Built-in DSQ view */ + struct { + u64 value : 32; + u64 rsvd : 30; + u64 local_on : 1; + u64 builtin : 1; + } builtin_dsq; + + /* NOTE: Considered packed and aligned attributes, but that's redundant */ +} dsq_id_t; + +/* + * Invalid DSQ ID Sentinel: + * invalid bc bit 63 clear (it's a user DSQ) && dsq_type == 0 (no type) + * Good for catching uninitialized DSQ IDs. +*/ +#define DSQ_INVALID ((u64)0) + +_Static_assert(sizeof(((dsq_id_t){ 0 }).cpu_dsq) == sizeof(u64), + "cpu view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).cell_l3_dsq) == sizeof(u64), + "cell+l3 view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).user_dsq) == sizeof(u64), + "user common view must be 8 bytes"); +_Static_assert(sizeof(((dsq_id_t){ 0 }).builtin_dsq) == sizeof(u64), + "builtin view must be 8 bytes"); + +/* Compile-time checks (in bytes) */ +_Static_assert(sizeof(dsq_id_t) == sizeof(u64), + "dsq_id_t must be 8 bytes (64 bits)"); +_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), + "dsq_id_t must be 8-byte aligned"); + +/* DSQ type enumeration */ +enum dsq_type { + DSQ_TYPE_NONE, + DSQ_TYPE_CPU, + DSQ_TYPE_CELL_L3, +}; + +/* Range guards */ +_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); +_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); +_Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); +_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), + "DSQ_TYPE_CELL_L3 must fit in field"); + +/* + * While I considered error propagation, I decided to bail to force errors early. +*/ + +static inline bool is_user_dsq(dsq_id_t dsq_id) +{ + return !dsq_id.builtin_dsq.builtin && + dsq_id.user_dsq.type != DSQ_TYPE_NONE; +} + +// Is this a per CPU DSQ? +static inline bool is_cpu_dsq(dsq_id_t dsq_id) +{ + return is_user_dsq(dsq_id) && dsq_id.user_dsq.type == DSQ_TYPE_CPU; +} + +// If this is a per cpu dsq, return the cpu +static inline u32 get_cpu_from_dsq(dsq_id_t dsq_id) +{ + if (!is_cpu_dsq(dsq_id)) + scx_bpf_error("trying to get cpu from non-cpu dsq\n"); + + return dsq_id.cpu_dsq.cpu; +} + +/* Helper functions to construct DSQ IDs */ +static inline dsq_id_t get_cpu_dsq_id(u32 cpu) +{ + // Check for valid CPU range, 0 indexed so >=. 
+ if (cpu >= MAX_CPUS) + scx_bpf_error("invalid cpu %u\n", cpu); + + return (dsq_id_t){ .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; +} + +static inline dsq_id_t get_cell_l3_dsq_id(u32 cell, u32 l3) +{ + if (cell >= MAX_CELLS || l3 >= MAX_L3S) + scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); + + return (dsq_id_t){ .cell_l3_dsq = { .l3 = l3, + .cell = cell, + .type = DSQ_TYPE_CELL_L3 } }; +} diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index 0658734545..b1fcbf7941 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -5,7 +5,8 @@ #ifndef __INTF_H #define __INTF_H -#ifndef __KERNEL__ +#ifndef __BPF__ +#include typedef unsigned long long u64; typedef unsigned int u32; typedef _Bool bool; @@ -18,6 +19,10 @@ typedef _Bool bool; #include #endif +/* ---- Work stealing config (compile-time) ------------------------------- */ +#define MITOSIS_ENABLE_STEALING 1 +/* ----------------------------------------------------------------------- */ + enum consts { CACHELINE_SIZE = 64, MAX_CPUS_SHIFT = 9, @@ -28,8 +33,67 @@ enum consts { PCPU_BASE = 0x80000000, MAX_CG_DEPTH = 256, + + MAX_L3S = 16, +}; + +/* Kernel side sees the real lock; userspace sees padded bytes of same size/alignment */ +#if defined(__BPF__) +#define CELL_LOCK_T struct bpf_spin_lock +#else +/* userspace placeholder: kernel won’t copy spin_lock */ +#define CELL_LOCK_T \ + struct { \ + u32 __pad; \ + } /* 4-byte aligned as required */ +#endif + +struct cell { + // This is a lock in the kernel and padding in the user + CELL_LOCK_T lock; // Assumed to be the first entry (see below) + + // Whether or not the cell is used + u32 in_use; + + // Number of CPUs in this cell + u32 cpu_cnt; + + // Number of L3s with at least one CPU in this cell + u32 l3_present_cnt; + + // Number of CPUs from each L3 assigned to this cell + u32 l3_cpu_cnt[MAX_L3S]; + + // per-L3 vtimes within this cell + u64 l3_vtime_now[MAX_L3S]; }; +// Putting the lock first in the struct is our convention. +// We pad this space when in Rust code that will never see the lock value. +// We intentionally avoid it in copy_cell_no_lock to keep the verifier happy. +// It is a BPF constraint that it is 4 byte aligned. + +// All assertions work for both BPF and userspace builds +_Static_assert(offsetof(struct cell, lock) == 0, + "lock/padding must be first field"); + +_Static_assert(sizeof(((struct cell *)0)->lock) == 4, + "lock/padding must be 4 bytes"); + +_Static_assert(_Alignof(CELL_LOCK_T) == 4, + "lock/padding must be 4-byte aligned"); + +_Static_assert(offsetof(struct cell, in_use) == 4, + "in_use must follow 4-byte lock/padding"); + +// Verify these are the same size in both BPF and Rust. 
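+/*
+ * Size arithmetic behind the asserts below, with MAX_L3S == 16 as defined
+ * above:
+ *   4 x u32 header fields (lock/pad, in_use, cpu_cnt, l3_present_cnt) = 16
+ *   u32 l3_cpu_cnt[MAX_L3S]   (4 * 16)                                = 64
+ *   u64 l3_vtime_now[MAX_L3S] (8 * 16)                                = 128
+ * l3_vtime_now begins at offset 80, which is already 8-byte aligned, so
+ * there is no padding and the total is 208 bytes.
+ */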
+_Static_assert(sizeof(struct cell) == + ((4 * sizeof(u32)) + (4 * MAX_L3S) + (8 * MAX_L3S)), + "struct cell size must be stable for Rust bindings"); + +_Static_assert(sizeof(struct cell) == 208, + "struct cell must be exactly 208 bytes"); + /* Statistics */ enum cell_stat_idx { CSTAT_LOCAL, @@ -39,6 +103,14 @@ enum cell_stat_idx { NR_CSTATS, }; +/* Function invocation counters */ +enum fn_counter_idx { + COUNTER_SELECT_CPU, + COUNTER_ENQUEUE, + COUNTER_DISPATCH, + NR_COUNTERS, +}; + struct cpu_ctx { u64 cstats[MAX_CELLS][NR_CSTATS]; u64 cell_cycles[MAX_CELLS]; @@ -51,14 +123,4 @@ struct cgrp_ctx { bool cell_owner; }; -/* - * cell is the per-cell book-keeping -*/ -struct cell { - // current vtime of the cell - u64 vtime_now; - // Whether or not the cell is used or not - u32 in_use; -}; - #endif /* __INTF_H */ diff --git a/scheds/rust/scx_mitosis/src/bpf/intf_rust.h b/scheds/rust/scx_mitosis/src/bpf/intf_rust.h new file mode 100644 index 0000000000..f8ffd3252a --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/intf_rust.h @@ -0,0 +1,4 @@ +/* Force userspace path for Rust bindgen */ +#undef __BPF__ +#undef __bpf__ +#include "intf.h" diff --git a/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h new file mode 100644 index 0000000000..492b2723c7 --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h @@ -0,0 +1,310 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This header assists adding L3 cache awareness to scx_mitosis by defining + * maps and fns for managing CPU-to-L3 domain mappings. It provides code to + * recalculate per-L3 CPU counts within cells and implements weighted + * random L3 selection for tasks. It also tracks work-stealing + * statistics for cross-L3 task migrations. + */ +#pragma once + +#include "mitosis.bpf.h" +#include "intf.h" + +typedef u32 l3_id_t; +#define L3_INVALID ((l3_id_t)~0u) + +// Configure how aggressively we steal work. 
+// When task is detected as a steal candidate, skip it this many times +// On a web server workload, 100 reduced steal count by ~90% +#define PREVENT_N_STEALS 0 + +/* Work stealing statistics map - accessible from both BPF and userspace */ +struct steal_stats_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 1); +}; + +// A CPU -> L3 cache ID map +struct cpu_to_l3_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, MAX_CPUS); +}; + +struct l3_to_cpus_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cpumask); + __uint(max_entries, MAX_L3S); +}; + +extern struct cpu_to_l3_map cpu_to_l3; +extern struct l3_to_cpus_map l3_to_cpus; +extern struct steal_stats_map steal_stats; + +static inline const bool l3_is_valid(u32 l3_id) +{ + if (l3_id == L3_INVALID) + return false; + + return (l3_id >= 0) && (l3_id < MAX_L3S); +} + +static inline void init_task_l3(struct task_ctx *tctx) +{ + tctx->l3 = L3_INVALID; + +#if MITOSIS_ENABLE_STEALING + tctx->pending_l3 = L3_INVALID; + tctx->steal_count = 0; + tctx->last_stolen_at = 0; + tctx->steals_prevented = 0; +#endif +} + +static inline const struct cpumask *lookup_l3_cpumask(u32 l3) +{ + struct cpumask *mask; + + if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { + scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); + return NULL; + } + + return mask; +} + +/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes */ +static __always_inline void recalc_cell_l3_counts(u32 cell_idx) +{ + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_error("recalc_cell_l3_counts: invalid cell %d", + cell_idx); + return; + } + + CPUMASK_GUARD(tmp_guard); + if (!tmp_guard.mask) { + scx_bpf_error( + "recalc_cell_l3_counts: failed to create tmp mask"); + return; + } + + u32 l3, l3s_present = 0, total_cpus = 0; + // Just so we don't hold the lock longer than necessary + u32 l3_cpu_cnt_tmp[MAX_L3S] = { 0 }; + + { // RCU context + RCU_READ_GUARD(); + const struct cpumask *cell_mask = + lookup_cell_cpumask(cell_idx); // RCU ptr + + if (!cell_mask) { + scx_bpf_error( + "recalc_cell_l3_counts: invalid cell mask"); + return; + } + + bpf_for(l3, 0, nr_l3) + { + const struct cpumask *l3_mask = lookup_l3_cpumask(l3); + if (!l3_mask) { + scx_bpf_error( + "recalc_cell_l3_counts: invalid l3 mask"); + return; + } + + bpf_cpumask_and(tmp_guard.mask, cell_mask, l3_mask); + + u32 cnt = bpf_cpumask_weight( + (const struct cpumask *)tmp_guard.mask); + + l3_cpu_cnt_tmp[l3] = cnt; + + bpf_printk("recalc_cell_l3_counts: cnt %d", cnt); + + // These are counted across the whole cell + total_cpus += cnt; + + // Number of non-empty L3s in this cell + if (cnt) + l3s_present++; + } + } // unlock RCU + + // Write to cell + bpf_spin_lock(&cell->lock); + for (u32 l3 = 0; l3 < nr_l3; l3++) { + cell->l3_cpu_cnt[l3] = l3_cpu_cnt_tmp[l3]; + } + + cell->l3_present_cnt = l3s_present; + cell->cpu_cnt = total_cpus; + bpf_spin_unlock(&cell->lock); +} + +/** + * Weighted random selection of an L3 cache domain for a task. + * + * Uses the CPU count in each L3 domain within the cell as weights to + * probabilistically select an L3. L3 domains with more CPUs in the cell + * have higher probability of being selected. 
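+ * For example (hypothetical counts): if a cell's l3_cpu_cnt is {4, 2, 2},
+ * cpu_cnt is 8 and the random target in [0, 8) selects L3 0 with
+ * probability 4/8 and L3s 1 and 2 with probability 2/8 each.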
+ * + * @cell_id: The cell ID to select an L3 from + * @return: L3 ID on success, L3_INVALID on error + */ +static inline s32 pick_l3_for_task(u32 cell_id) +{ + struct cell *cell; + + /* Look up the cell structure */ + if (!(cell = lookup_cell(cell_id))) { + scx_bpf_error("pick_l3_for_task: invalid cell %d", cell_id); + return L3_INVALID; + } + + // Snapshot the current state of the cell + struct cell cell_snapshot; + bpf_spin_lock(&cell->lock); + copy_cell_skip_lock(&cell_snapshot, cell); + bpf_spin_unlock(&cell->lock); + + // No cpus + if (!cell_snapshot.cpu_cnt) { + scx_bpf_error( + "pick_l3_for_task: cell %d has no CPUs accounted yet", + cell_id); + return L3_INVALID; + } + + /* Find the L3 domain corresponding to the target value using + * weighted selection - accumulate CPU counts until we exceed target */ + + /* Generate random target value in range [0, cpu_cnt) */ + u32 target = bpf_get_prandom_u32() % cell_snapshot.cpu_cnt; + u32 l3, cur = 0; + s32 ret = L3_INVALID; + + // This could be a prefix sum. Find first l3 where we exceed target + bpf_for(l3, 0, nr_l3) + { + cur += cell_snapshot.l3_cpu_cnt[l3]; + if (target < cur) { + ret = (s32)l3; + break; + } + } + + if (ret == L3_INVALID) { + scx_bpf_error("pick_l3_for_task: invalid L3"); + return L3_INVALID; + } + + return ret; +} + +#if MITOSIS_ENABLE_STEALING + +static inline bool try_stealing_this_task(struct task_ctx *task_ctx, + s32 local_l3, u64 candidate_dsq) +{ + // Attempt the steal, can fail beacuse it's a race. + if (!scx_bpf_dsq_move_to_local(candidate_dsq)) + return false; + + // We got the task! + task_ctx->steal_count++; + task_ctx->last_stolen_at = scx_bpf_now(); + /* Retag to thief L3 (the one for this cpu) */ + task_ctx->pending_l3 = local_l3; + task_ctx->steals_prevented = 0; + + /* Increment steal counter in map */ + u32 key = 0; + u64 *count = bpf_map_lookup_elem(&steal_stats, &key); + // NOTE: This could get expensive, but I'm not anticipating that many steals. Percpu if we care. + if (count) + __sync_fetch_and_add(count, 1); + + return true; +} + +/* Work stealing: + * Scan sibling (cell,L3) DSQs in the same cell and steal the first queued task if it can run on this cpu +*/ +static inline bool try_stealing_work(u32 cell, s32 local_l3) +{ + if (!l3_is_valid(local_l3)) + scx_bpf_error("try_stealing_work: invalid local_l3"); + + struct cell *cell_ptr = lookup_cell(cell); + if (!cell_ptr) + scx_bpf_error("try_stealing_work: invalid cell"); + + // Loop over all other L3s, looking for a queued task to steal + u32 i; + bpf_for(i, 1, nr_l3) + { + // Start with the next one to spread out the load + u32 candidate_l3 = (local_l3 + i) % nr_l3; + + // Prevents the optimizer from removing the following conditional return + // so that the verifier knows the read wil be safe + barrier_var(candidate_l3); + + if (candidate_l3 >= MAX_L3S) + continue; + + // Skip L3s that are not present in this cell + // Note: rechecking cell_ptr for verifier + // TODO: Lock? 
+ if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0) + continue; + + u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3).raw; + + struct task_struct *task = NULL; + struct task_ctx *task_ctx; + // I'm only using this for the verifier + bool found_task = false; + + // Optimization: skip if faster than constructing an iterator + // Not redundant with later checking if task found (race) + if (!scx_bpf_dsq_nr_queued(candidate_dsq)) + continue; + + // Just a trick for peeking the head element + bpf_for_each(scx_dsq, task, candidate_dsq, 0) + { + task_ctx = lookup_task_ctx(task); + found_task = (task_ctx != NULL); + break; + } + + // No task? Try next L3 + if (!found_task) + continue; + + // This knob throttles stealing. + // TODO: make runtime configurable + if (task_ctx->steals_prevented++ < PREVENT_N_STEALS) { + continue; + } + + if (!try_stealing_this_task(task_ctx, local_l3, candidate_dsq)) + continue; + + // Success, we got a task (no guarantee it was the one we peeked though... race) + return true; + } + return false; +} +#endif diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index b40c06e79d..3e1eac406e 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -12,16 +12,12 @@ * cgroups belonging to the cell. */ -#include "intf.h" - -#ifdef LSP -#define __bpf__ -#include "../../../../include/scx/common.bpf.h" -#include "../../../../include/scx/ravg_impl.bpf.h" -#else -#include -#include -#endif +// TODO: fix debug printer. +// #include "intf.h" + +#include "mitosis.bpf.h" +#include "dsq.bpf.h" +#include "l3_aware.bpf.h" char _license[] SEC("license") = "GPL"; @@ -35,6 +31,7 @@ const volatile unsigned char all_cpus[MAX_CPUS_U8]; const volatile u64 slice_ns; const volatile u64 root_cgid = 1; +const volatile u32 nr_l3 = 1; /* * CPU assignment changes aren't fully in effect until a subsequent tick() * configuration_seq is bumped on each assignment change @@ -48,23 +45,32 @@ private(root_cgrp) struct cgroup __kptr *root_cgrp; UEI_DEFINE(uei); +// Cells now defined as a map so we can lock. +struct cell_map cells SEC(".maps"); + /* - * We store per-cpu values along with per-cell values. Helper functions to - * translate. 
- */ -static inline u32 cpu_dsq(u32 cpu) -{ - return PCPU_BASE | cpu; -} + * Maps used for L3-aware scheduling +*/ +#if 0 +struct cell_locks_map cell_locks SEC(".maps"); +#endif +struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); +struct l3_to_cpus_map l3_to_cpus SEC(".maps"); -static inline u32 cell_dsq(u32 cell) -{ - return cell; -} +/* + * Maps for statistics +*/ +struct function_counters_map function_counters SEC(".maps"); +struct steal_stats_map steal_stats SEC(".maps"); -static inline u32 dsq_to_cpu(u32 dsq) +static inline void increment_counter(enum fn_counter_idx idx) { - return dsq & ~PCPU_BASE; + u64 *counter; + u32 key = idx; + + counter = bpf_map_lookup_elem(&function_counters, &key); + if (counter) + (*counter)++; } static inline struct cgroup *lookup_cgrp_ancestor(struct cgroup *cgrp, @@ -119,28 +125,6 @@ static inline struct cgroup *task_cgroup(struct task_struct *p) return cgrp; } -/* - * task_ctx is the per-task information kept by scx_mitosis - */ -struct task_ctx { - /* cpumask is the set of valid cpus this task can schedule on */ - /* (tasks cpumask anded with its cell cpumask) */ - struct bpf_cpumask __kptr *cpumask; - /* started_running_at for recording runtime */ - u64 started_running_at; - u64 basis_vtime; - /* For the sake of monitoring, each task is owned by a cell */ - u32 cell; - /* For the sake of scheduling, a task is exclusively owned by either a cell - * or a cpu */ - u32 dsq; - /* latest configuration that was applied for this task */ - /* (to know if it has to be re-applied) */ - u32 configuration_seq; - /* Is this task allowed on all cores of its cell? */ - bool all_cell_cpus_allowed; -}; - struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); @@ -185,20 +169,6 @@ static inline struct cpu_ctx *lookup_cpu_ctx(int cpu) return cctx; } -struct cell cells[MAX_CELLS]; - -static inline struct cell *lookup_cell(int idx) -{ - struct cell *cell; - - cell = MEMBER_VPTR(cells, [idx]); - if (!cell) { - scx_bpf_error("Invalid cell %d", idx); - return NULL; - } - return cell; -} - /* * Cells are allocated concurrently in some cases (e.g. cgroup_init). * allocate_cell and free_cell enable these allocations to be done safely @@ -212,8 +182,17 @@ static inline int allocate_cell() if (!(c = lookup_cell(cell_idx))) return -1; - if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) + bpf_spin_lock(&c->lock); + if (c->in_use == 0) { + // Zero everything except the lock (which is first) + __builtin_memset(&c->in_use, 0, + sizeof(struct cell) - + sizeof(CELL_LOCK_T)); + c->in_use = 1; // Then mark as in use + bpf_spin_unlock(&c->lock); return cell_idx; + } + bpf_spin_unlock(&c->lock); } scx_bpf_error("No available cells to allocate"); return -1; @@ -292,7 +271,6 @@ static inline int update_task_cpumask(struct task_struct *p, { const struct cpumask *cell_cpumask; struct cpu_ctx *cpu_ctx; - struct cell *cell; u32 cpu; if (!(cell_cpumask = lookup_cell_cpumask(tctx->cell))) @@ -301,11 +279,24 @@ static inline int update_task_cpumask(struct task_struct *p, if (!tctx->cpumask) return -EINVAL; + /* + * Calculate the intersection of CPUs that are both: + * 1. In this task's assigned cell (cell_cpumask) + * 2. Allowed by the task's CPU affinity (p->cpus_ptr) + * Store result in tctx->cpumask - this becomes the effective CPU set + * where this task can actually run. 
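+ * For example (hypothetical masks): if the cell spans CPUs 0-7 and the
+ * task's affinity is {2, 3, 9}, the effective set becomes {2, 3}.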
+ */ bpf_cpumask_and(tctx->cpumask, cell_cpumask, p->cpus_ptr); - if (cell_cpumask) - tctx->all_cell_cpus_allowed = - bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); + /* + * Check if the task can run on ALL CPUs in its assigned cell. + * If cell_cpumask is a subset of p->cpus_ptr, it means the task's + * CPU affinity doesn't restrict it within the cell - it can use + * any CPU in the cell. This affects scheduling decisions later. + * True if all the bits in cell_cpumask are set in p->cpus_ptr. + */ + tctx->all_cell_cpus_allowed = + bpf_cpumask_subset(cell_cpumask, p->cpus_ptr); /* * XXX - To be correct, we'd need to calculate the vtime @@ -317,16 +308,66 @@ static inline int update_task_cpumask(struct task_struct *p, * Revisit if high frequency dynamic cell switching * needs to be supported. */ + + // We want to set the task vtime to that of the cell it's joining. if (tctx->all_cell_cpus_allowed) { - tctx->dsq = cell_dsq(tctx->cell); - if (!(cell = lookup_cell(tctx->cell))) + const struct cpumask *l3_mask = NULL; + if (tctx->l3 != L3_INVALID) { + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + /* If the L3 no longer intersects the cell's cpumask, invalidate it */ + if (!l3_mask || + !bpf_cpumask_intersects(cell_cpumask, l3_mask)) + tctx->l3 = L3_INVALID; + } + + /* --- Pick a new L3 if needed --- */ + if (tctx->l3 == L3_INVALID) { + s32 new_l3 = pick_l3_for_task(tctx->cell); + if (new_l3 < 0) { + scx_bpf_error("bad L3: %d", new_l3); + return -ENODEV; + } + tctx->l3 = new_l3; + l3_mask = lookup_l3_cpumask((u32)tctx->l3); + if (!l3_mask) + return -ENOENT; + } + + /* --- Narrow the effective cpumask by the chosen L3 --- */ + /* tctx->cpumask already contains (task_affinity ∧ cell_mask) */ + if (tctx->cpumask) + bpf_cpumask_and(tctx->cpumask, + (const struct cpumask *)tctx->cpumask, + l3_mask); + + /* If empty after intersection, nothing can run here */ + if (tctx->cpumask && + bpf_cpumask_empty((const struct cpumask *)tctx->cpumask)) { + scx_bpf_error("Empty cpumask after intersection"); + return -ENODEV; + } + + /* --- Point to the correct (cell,L3) DSQ and set vtime baseline --- */ + tctx->dsq = get_cell_l3_dsq_id(tctx->cell, tctx->l3); + + struct cell *cell = lookup_cell(tctx->cell); + if (!cell) { + scx_bpf_error("Invalid cell"); return -ENOENT; - p->scx.dsq_vtime = READ_ONCE(cell->vtime_now); + } + + if (!l3_is_valid(tctx->l3)) { + scx_bpf_error("Invalid L3 %d", tctx->l3); + return -EINVAL; + } + + p->scx.dsq_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); } else { + /* Task is CPU-restricted, use task mask */ cpu = bpf_cpumask_any_distribute(p->cpus_ptr); if (!(cpu_ctx = lookup_cpu_ctx(cpu))) return -ENOENT; - tctx->dsq = cpu_dsq(cpu); + tctx->dsq = get_cpu_dsq_id(cpu); p->scx.dsq_vtime = READ_ONCE(cpu_ctx->vtime_now); } @@ -442,20 +483,24 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, struct cpu_ctx *cctx; struct task_ctx *tctx; + increment_counter(COUNTER_SELECT_CPU); + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) return prev_cpu; if (maybe_refresh_cell(p, tctx) < 0) return prev_cpu; + /* Pinned path: only if our task really requires a per-CPU queue. 
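+ * (i.e. its affinity does not cover every CPU in its cell, so
+ * update_task_cpumask() assigned it a per-CPU DSQ)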
*/ if (!tctx->all_cell_cpus_allowed) { cstat_inc(CSTAT_AFFN_VIOL, tctx->cell, cctx); - cpu = dsq_to_cpu(tctx->dsq); + cpu = get_cpu_from_dsq(tctx->dsq); if (scx_bpf_test_and_clear_cpu_idle(cpu)) scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); return cpu; } + // Grab an idle core if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) { cstat_inc(CSTAT_LOCAL, tctx->cell, cctx); scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); @@ -489,14 +534,17 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) s32 cpu = -1; u64 basis_vtime; + increment_counter(COUNTER_ENQUEUE); + if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1))) return; if (maybe_refresh_cell(p, tctx) < 0) return; + // Cpu pinned work if (!tctx->all_cell_cpus_allowed) { - cpu = dsq_to_cpu(tctx->dsq); + cpu = get_cpu_from_dsq(tctx->dsq); } else if (!__COMPAT_is_enq_cpu_selected(enq_flags)) { /* * If we haven't selected a cpu, then we haven't looked for and kicked an @@ -520,12 +568,23 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) } if (tctx->all_cell_cpus_allowed) { + // This is a task that can run on any cpu in the cell + cstat_inc(CSTAT_CELL_DSQ, tctx->cell, cctx); - /* Task can use any CPU in its cell, so use the cell DSQ */ + + /* Task can use any CPU in its cell, set basis_vtime from per-(cell, L3) vtime */ if (!(cell = lookup_cell(tctx->cell))) return; - basis_vtime = READ_ONCE(cell->vtime_now); + + if (!l3_is_valid(tctx->l3)) { + scx_bpf_error("Invalid L3 ID for task %d in enqueue", + p->pid); + return; + } + basis_vtime = READ_ONCE(cell->l3_vtime_now[tctx->l3]); + } else { + // This is a task that can only run on a specific cpu cstat_inc(CSTAT_CPU_DSQ, tctx->cell, cctx); /* @@ -540,7 +599,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) tctx->basis_vtime = basis_vtime; - if (time_after(vtime, basis_vtime + 8192 * slice_ns)) { + if (time_after(vtime, + basis_vtime + VTIME_MAX_FUTURE_MULTIPLIER * slice_ns)) { scx_bpf_error("vtime is too far in the future for %d", p->pid); return; } @@ -548,10 +608,11 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) * Limit the amount of budget that an idling task can accumulate * to one slice. */ + // TODO: Should this be time_before64? if (time_before(vtime, basis_vtime - slice_ns)) vtime = basis_vtime - slice_ns; - scx_bpf_dsq_insert_vtime(p, tctx->dsq, slice_ns, vtime, enq_flags); + scx_bpf_dsq_insert_vtime(p, tctx->dsq.raw, slice_ns, vtime, enq_flags); /* Kick the CPU if needed */ if (!__COMPAT_is_enq_cpu_selected(enq_flags) && cpu >= 0) @@ -563,70 +624,81 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) struct cpu_ctx *cctx; u32 cell; + increment_counter(COUNTER_DISPATCH); + if (!(cctx = lookup_cpu_ctx(-1))) return; cell = READ_ONCE(cctx->cell); - bool found = false; - u64 min_vtime_dsq; - u64 min_vtime; + /* Start from a valid DSQ */ + dsq_id_t local_dsq = get_cpu_dsq_id(cpu); + bool found = false; + dsq_id_t min_vtime_dsq = local_dsq; + u64 min_vtime = ~0ULL; /* U64_MAX */ struct task_struct *p; - bpf_for_each(scx_dsq, p, cell, 0) { - min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = cell; - found = true; - break; + + // Get L3 + u32 cpu_key = (u32)cpu; + u32 *l3_ptr = bpf_map_lookup_elem(&cpu_to_l3, &cpu_key); + s32 l3 = l3_ptr ? 
(s32)*l3_ptr : L3_INVALID; + + /* Check the L3 queue */ + if (l3 != L3_INVALID) { + dsq_id_t cell_l3_dsq = get_cell_l3_dsq_id(cell, l3); + bpf_for_each(scx_dsq, p, cell_l3_dsq.raw, 0) + { + min_vtime = p->scx.dsq_vtime; + min_vtime_dsq = cell_l3_dsq; + found = true; + break; + } } - u64 dsq = cpu_dsq(cpu); - bpf_for_each(scx_dsq, p, dsq, 0) { + /* Check the CPU DSQ for a lower vtime */ + bpf_for_each(scx_dsq, p, local_dsq.raw, 0) + { if (!found || time_before(p->scx.dsq_vtime, min_vtime)) { min_vtime = p->scx.dsq_vtime; - min_vtime_dsq = dsq; + min_vtime_dsq = local_dsq; found = true; } break; } /* - * If we failed to find an eligible task, scx will keep running prev if - * prev->scx.flags & SCX_TASK_QUEUED (we don't set SCX_OPS_ENQ_LAST), and - * otherwise go idle. - */ - if (!found) - return; - /* - * The move_to_local can fail if we raced with some other cpu in the cell - * and now the cell is empty. We have to ensure to try the cpu_dsq or else - * we might never wakeup. - */ + * The move_to_local can fail if we raced with some other cpu in the cell + * and now the cell is empty. We have to ensure to try the cpu_dsq or else + * we might never wakeup. + */ - if (!scx_bpf_dsq_move_to_local(min_vtime_dsq) && min_vtime_dsq != dsq) - scx_bpf_dsq_move_to_local(dsq); -} + if (found) { + // We found a task in the local or cell-L3 DSQ -/* - * A couple of tricky things about checking a cgroup's cpumask: - * - * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get - * this right now is to copy the cpumask to a map entry. Given that cgroup init - * could be re-entrant we have a few per-cpu entries in a map to make this - * doable. - * - * Second, cpumask can sometimes be stored as an array in-situ or as a pointer - * and with different lengths. Some bpf_core_type_matches finagling can make - * this all work. - */ -#define MAX_CPUMASK_ENTRIES (4) + // If it was in the per cpu DSQ, there is no competation, grab it and return + if (min_vtime_dsq.raw == local_dsq.raw) { + scx_bpf_dsq_move_to_local(min_vtime_dsq.raw); + return; + } -/* - * We don't know how big struct cpumask is at compile time, so just allocate a - * large space and check that it is big enough at runtime - */ -#define CPUMASK_LONG_ENTRIES (128) -#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) + // If it was in the cell L3 DSQ, we are competing with other cpus in the cell-l3 + // try to move it to the local DSQ + if (scx_bpf_dsq_move_to_local(min_vtime_dsq.raw)) { + // We won the race and got the task, return + return; + } + } + +#if MITOSIS_ENABLE_STEALING + // We didn't find a task in either DSQ, or lost the race. + // Instead of going straight to idle, attempt to steal a task from another + // L3 in the cell. + + // Try stealing. If successful, this moves the task to the local runqueue + try_stealing_work(cell, l3); +#endif +} struct cpumask_entry { unsigned long cpumask[CPUMASK_LONG_ENTRIES]; @@ -758,18 +830,19 @@ static inline int get_cgroup_cpumask(struct cgroup *cgrp, u32 level_cells[MAX_CG_DEPTH]; int running; +/* The guard is a stack variable. When it falls out of scope, + * we drop the running lock. */ +static inline void __running_unlock(int *guard) +{ + (void)guard; /* unused */ + WRITE_ONCE(running, 0); +} + /* * On tick, we identify new cells and apply CPU assignment */ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) { - /* - * We serialize tick() on core 0 and ensure only one tick running at a time - * to ensure this can only happen once. 
- */ - if (bpf_get_smp_processor_id()) - return; - u32 local_configuration_seq = READ_ONCE(configuration_seq); if (local_configuration_seq == READ_ONCE(applied_configuration_seq)) return; @@ -779,6 +852,8 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) return; + int __attribute__((cleanup(__running_unlock), unused)) __running_guard; + DECLARE_CPUMASK_ENTRY(entry) = allocate_cpumask_entry(); if (!entry) return; @@ -809,8 +884,7 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) } /* - * Initialize root cell cpumask to all cpus, and then remove from it as we - * go + * Initialize root cell cpumask to all cpus, and then remove from it as we go */ bpf_cpumask_copy(root_bpf_cpumask, (const struct cpumask *)all_cpumask); @@ -842,7 +916,8 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) * Iterate over all cgroups, check if any have a cpumask and populate them * as a separate cell. */ - bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) { + bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_DESCENDANTS_PRE) + { cur_cgrp = pos->cgroup; /* @@ -967,13 +1042,12 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) int cpu_idx; bpf_for(cpu_idx, 0, nr_possible_cpus) { - if (bpf_cpumask_test_cpu( - cpu_idx, (const struct cpumask *)&entry->cpumask)) { + if (bpf_cpumask_test_cpu(cpu_idx, (const struct cpumask *) + root_bpf_cpumask)) { struct cpu_ctx *cpu_ctx; if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) goto out_root_cgrp; cpu_ctx->cell = 0; - bpf_cpumask_clear_cpu(cpu_idx, root_bpf_cpumask); } } @@ -992,18 +1066,40 @@ void BPF_STRUCT_OPS(mitosis_tick, struct task_struct *p_run) goto out_root_cgrp; } + int cell_idx; + /* Recalculate L3 counts for all active cells after CPU assignment changes */ + bpf_for(cell_idx, 1, MAX_CELLS) + { + struct cell *cell; + if (!(cell = lookup_cell(cell_idx))) { + scx_bpf_error("Lookup for cell %d failed in tick()", + cell_idx); + goto out_root_cgrp; + } + + if (!cell->in_use) + continue; + + /* Recalculate L3 counts for each active cell */ + recalc_cell_l3_counts(cell_idx); + } + + /* Recalculate root cell's L3 counts after cpumask update */ + recalc_cell_l3_counts(ROOT_CELL_ID); + barrier(); WRITE_ONCE(applied_configuration_seq, local_configuration_seq); - WRITE_ONCE(running, 0); bpf_cgroup_release(root_cgrp_ref); return; + out_rcu_unlock: bpf_rcu_read_unlock(); out_root_cgrp: bpf_cgroup_release(root_cgrp_ref); out: - bpf_cpumask_release(root_bpf_cpumask); + if (root_bpf_cpumask) + bpf_cpumask_release(root_bpf_cpumask); } void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) @@ -1016,13 +1112,47 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) !(cell = lookup_cell(cctx->cell))) return; + /* + * If this task was stolen across L3s, retag to thief L3 and recompute + * effective cpumask+DSQ. Preserve vtime to keep fairness. 
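+ * (update_task_cpumask() resets dsq_vtime to the destination queue's
+ * baseline, so the saved value is restored after the call.)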
+ */ +#if MITOSIS_ENABLE_STEALING + if (l3_is_valid(tctx->pending_l3)) { + u64 save_v = p->scx.dsq_vtime; + tctx->l3 = tctx->pending_l3; + tctx->pending_l3 = L3_INVALID; + update_task_cpumask(p, tctx); + p->scx.dsq_vtime = save_v; + } +#endif + + /* Validate task's DSQ before it starts running */ + if (tctx->dsq.raw == DSQ_INVALID) { + if (tctx->all_cell_cpus_allowed) { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CELL-SCHEDULABLE task, can run on any CPU in cell %d)", + p->pid, tctx->cell); + } else { + scx_bpf_error( + "Task %d has invalid DSQ 0 in running callback (CORE-PINNED task, restricted to specific CPUs)", + p->pid); + } + return; + } + /* - * Update both the CPU's cell and the cpu's vtime so the vtime's are - * comparable at dispatch time. + * Update per-(cell, L3) vtime for cell-schedulable tasks */ - if (time_before(READ_ONCE(cell->vtime_now), p->scx.dsq_vtime)) - WRITE_ONCE(cell->vtime_now, p->scx.dsq_vtime); + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { + if (time_before(READ_ONCE(cell->l3_vtime_now[tctx->l3]), + p->scx.dsq_vtime)) + WRITE_ONCE(cell->l3_vtime_now[tctx->l3], + p->scx.dsq_vtime); + } + /* + * Update CPU vtime for CPU-pinned tasks + */ if (time_before(READ_ONCE(cctx->vtime_now), p->scx.dsq_vtime)) WRITE_ONCE(cctx->vtime_now, p->scx.dsq_vtime); @@ -1048,7 +1178,7 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) used = now - tctx->started_running_at; tctx->started_running_at = now; /* scale the execution time by the inverse of the weight and charge */ - p->scx.dsq_vtime += used * 100 / p->scx.weight; + p->scx.dsq_vtime += used * DEFAULT_WEIGHT_MULTIPLIER / p->scx.weight; if (cidx != 0 || tctx->all_cell_cpus_allowed) { u64 *cell_cycles = MEMBER_VPTR(cctx->cell_cycles, [cidx]); @@ -1057,6 +1187,17 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable) return; } *cell_cycles += used; + + /* + * For cell-schedulable tasks, also accumulate vtime into + * per-cell per-L3 queues + */ + if (tctx->all_cell_cpus_allowed && l3_is_valid(tctx->l3)) { + /* Accumulate weighted execution time into per-(cell, L3) vtime */ + cell->l3_vtime_now[tctx->l3] += + used * DEFAULT_WEIGHT_MULTIPLIER / + p->scx.weight; + } } } @@ -1083,8 +1224,9 @@ s32 BPF_STRUCT_OPS(mitosis_cgroup_init, struct cgroup *cgrp, return -ENOENT; } + // Special case for root cell if (cgrp->kn->id == root_cgid) { - WRITE_ONCE(cgc->cell, 0); + WRITE_ONCE(cgc->cell, ROOT_CELL_ID); return 0; } @@ -1175,6 +1317,7 @@ s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, { struct task_ctx *tctx; struct bpf_cpumask *cpumask; + int ret; tctx = bpf_task_storage_get(&task_ctxs, p, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); @@ -1200,16 +1343,24 @@ s32 BPF_STRUCT_OPS(mitosis_init_task, struct task_struct *p, return -EINVAL; } - return update_task_cell(p, tctx, args->cgroup); + /* Initialize L3 to invalid before cell assignment */ + init_task_l3(tctx); + + // TODO clean this up + if ((ret = update_task_cell(p, tctx, args->cgroup))) { + return ret; + } + + return 0; } __hidden void dump_cpumask_word(s32 word, const struct cpumask *cpumask) { u32 u, v = 0; - bpf_for(u, 0, 32) + bpf_for(u, 0, BITS_PER_U32) { - s32 cpu = 32 * word + u; + s32 cpu = BITS_PER_U32 * word + u; if (cpu < nr_possible_cpus && bpf_cpumask_test_cpu(cpu, cpumask)) v |= 1 << u; @@ -1239,9 +1390,82 @@ static void dump_cell_cpumask(int id) dump_cpumask(cell_cpumask); } +/* Print cell state for debugging */ +static __always_inline void dump_cell_state(u32 cell_idx) +{ + 
struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_dump("Cell %d: NOT FOUND", cell_idx); + return; + } + + scx_bpf_dump("Cell %d: in_use=%d, cpu_cnt=%d, l3_present_cnt=%d", + cell_idx, cell->in_use, cell->cpu_cnt, + cell->l3_present_cnt); + + u32 l3; + // TODO Print vtimes for L3s + // TODO lock + bpf_for(l3, 0, nr_l3) + { + if (cell->l3_cpu_cnt[l3] > 0) { + scx_bpf_dump(" L3[%d]: %d CPUs", l3, + cell->l3_cpu_cnt[l3]); + } + } +} + +static __always_inline void dump_l3_state() +{ + u32 l3; + const struct cpumask *l3_mask; + dsq_id_t dsq_id; + + scx_bpf_dump("\n=== L3 Cache Topology ===\n"); + scx_bpf_dump("Total L3 domains: %d\n", nr_l3); + + bpf_for(l3, 0, nr_l3) + { + l3_mask = lookup_l3_cpumask(l3); + if (!l3_mask) { + scx_bpf_dump( + "L3[%d]: ERROR - failed to lookup cpumask\n", + l3); + continue; + } + + scx_bpf_dump("L3[%d] CPUS=", l3); + dump_cpumask(l3_mask); + scx_bpf_dump("\n"); + + scx_bpf_dump(" Per-cell DSQ stats:\n"); + u32 cell_idx; + bpf_for(cell_idx, 0, MAX_CELLS) + { + struct cell *cell = lookup_cell(cell_idx); + if (!cell || !cell->in_use) + continue; + + if (!l3_is_valid(l3)) + continue; + + dsq_id = get_cell_l3_dsq_id(cell_idx, l3); + u64 nr_queued = scx_bpf_dsq_nr_queued(dsq_id.raw); + + if (nr_queued > 0 || cell->l3_cpu_cnt[l3] > 0) { + scx_bpf_dump( + " Cell[%d]: %d CPUs, vtime=%llu, nr_queued=%llu\n", + cell_idx, cell->l3_cpu_cnt[l3], + READ_ONCE(cell->l3_vtime_now[l3]), + nr_queued); + } + } + } +} + void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) { - u64 dsq_id; + dsq_id_t dsq_id; int i; struct cell *cell; struct cpu_ctx *cpu_ctx; @@ -1259,9 +1483,7 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) scx_bpf_dump("CELL[%d] CPUS=", i); dump_cell_cpumask(i); scx_bpf_dump("\n"); - scx_bpf_dump("CELL[%d] vtime=%llu nr_queued=%d\n", i, - READ_ONCE(cell->vtime_now), - scx_bpf_dsq_nr_queued(i)); + dump_cell_state(i); } bpf_for(i, 0, nr_possible_cpus) @@ -1269,11 +1491,13 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) if (!(cpu_ctx = lookup_cpu_ctx(i))) return; - dsq_id = cpu_dsq(i); + dsq_id = get_cpu_dsq_id(i); scx_bpf_dump("CPU[%d] cell=%d vtime=%llu nr_queued=%d\n", i, cpu_ctx->cell, READ_ONCE(cpu_ctx->vtime_now), - scx_bpf_dsq_nr_queued(dsq_id)); + scx_bpf_dsq_nr_queued(dsq_id.raw)); } + + dump_l3_state(); } void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, @@ -1285,9 +1509,9 @@ void BPF_STRUCT_OPS(mitosis_dump_task, struct scx_dump_ctx *dctx, return; scx_bpf_dump( - "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%x all_cell_cpus_allowed=%d\n", + "Task[%d] vtime=%llu basis_vtime=%llu cell=%u dsq=%llu all_cell_cpus_allowed=%d\n", p->pid, p->scx.dsq_vtime, tctx->basis_vtime, tctx->cell, - tctx->dsq, tctx->all_cell_cpus_allowed); + tctx->dsq.raw, tctx->all_cell_cpus_allowed); scx_bpf_dump("Task[%d] CPUS=", p->pid); dump_cpumask(p->cpus_ptr); scx_bpf_dump("\n"); @@ -1319,7 +1543,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { if (*u8_ptr & (1 << (i % 8))) { bpf_cpumask_set_cpu(i, cpumask); - ret = scx_bpf_create_dsq(cpu_dsq(i), -1); + ret = scx_bpf_create_dsq(get_cpu_dsq_id(i).raw, + ANY_NUMA); if (ret < 0) { bpf_cpumask_release(cpumask); return ret; @@ -1334,14 +1559,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) if (cpumask) bpf_cpumask_release(cpumask); + /* setup cell cpumasks */ bpf_for(i, 0, MAX_CELLS) { struct cell_cpumask_wrapper *cpumaskw; - - ret = scx_bpf_create_dsq(i, -1); - if (ret < 0) - return ret; - if (!(cpumaskw = 
bpf_map_lookup_elem(&cell_cpumasks, &i))) return -ENOENT; @@ -1373,7 +1594,32 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(mitosis_init) } } - cells[0].in_use = true; + struct cell *cell = lookup_cell(0); + if (!cell) { + scx_bpf_error("Failed to lookup cell 0"); + return -ENOENT; + } + cell->in_use = true; + + /* Configure root cell (cell 0) topology at init time using nr_l3 and l3_to_cpu masks */ + recalc_cell_l3_counts(ROOT_CELL_ID); + + /* Create (cell,L3) DSQs for all pairs. Userspace will populate maps. */ + // This is a crazy over-estimate + bpf_for(i, 0, MAX_CELLS) + { + u32 l3; + bpf_for(l3, 0, nr_l3) + { + ret = scx_bpf_create_dsq(get_cell_l3_dsq_id(i, l3).raw, + ANY_NUMA); + if (ret < 0) + scx_bpf_error( + "Failed to create DSQ for cell %d, L3 %d: err %d", + i, l3, ret); + } + } + return 0; } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h new file mode 100644 index 0000000000..52738c6a21 --- /dev/null +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -0,0 +1,207 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * This defines the core data structures, types, and constants + * for the scx_mitosis scheduler, primarily containing `struct cell` + * and `struct task_ctx`. + */ + +#pragma once + +#ifdef LSP +#define __bpf__ +#include "../../../../include/scx/common.bpf.h" +#include "../../../../include/scx/ravg_impl.bpf.h" +#else +#include +#include +#endif + +#include "intf.h" +#include "dsq.bpf.h" + +/* + * A couple of tricky things about checking a cgroup's cpumask: + * + * First, we need an RCU pointer to pass to cpumask kfuncs. The only way to get + * this right now is to copy the cpumask to a map entry. Given that cgroup init + * could be re-entrant we have a few per-cpu entries in a map to make this + * doable. + * + * Second, cpumask can sometimes be stored as an array in-situ or as a pointer + * and with different lengths. Some bpf_core_type_matches finagling can make + * this all work. + */ +#define MAX_CPUMASK_ENTRIES (4) + +/* + * We don't know how big struct cpumask is at compile time, so just allocate a + * large space and check that it is big enough at runtime + * TODO: This should be deduplicated with the rust code and put in intf.h + */ +#define CPUMASK_LONG_ENTRIES (128) +#define CPUMASK_SIZE (sizeof(long) * CPUMASK_LONG_ENTRIES) + +extern const volatile u32 nr_l3; + +extern struct cell_map cells; + +enum mitosis_constants { + + /* Root cell index */ + ROOT_CELL_ID = 0, + + /* Invalid/unset L3 value */ + // INVALID_L3_ID = -1, + + /* Default weight divisor for vtime calculation */ + DEFAULT_WEIGHT_MULTIPLIER = 100, + + /* Vtime validation multiplier (slice_ns * 8192) */ + VTIME_MAX_FUTURE_MULTIPLIER = 8192, + + /* Bits per u32 for cpumask operations */ + BITS_PER_U32 = 32, + + /* No NUMA constraint for DSQ creation */ + ANY_NUMA = -1, +}; + +static inline void copy_cell_skip_lock(struct cell *dst, const struct cell *src) +{ + /* Copy everything AFTER the lock field. + * Since lock is first and 4 bytes (verified by static assertions), + * we skip it and copy the remainder of the struct. 
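+ * With the layout checked in intf.h this copies
+ * sizeof(struct cell) - 4 == 204 bytes, starting at in_use (offset 4).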
+ */ + __builtin_memcpy(&dst->in_use, &src->in_use, + sizeof(struct cell) - sizeof(CELL_LOCK_T)); +} + +static inline struct cell *lookup_cell(int idx) +{ + struct cell *cell; + + cell = bpf_map_lookup_elem(&cells, &idx); + + if (!cell) { + scx_bpf_error("Invalid cell %d", idx); + return NULL; + } + return cell; +} + +static inline struct bpf_spin_lock *get_cell_lock(u32 cell_idx) +{ + if (cell_idx >= MAX_CELLS) { + scx_bpf_error("Invalid cell index %d", cell_idx); + return NULL; + } + + struct cell *cell = lookup_cell(cell_idx); + if (!cell) { + scx_bpf_error("Cell %d not found", cell_idx); + return NULL; + } + return &cell->lock; +} + +/* + * task_ctx is the per-task information kept by scx_mitosis + */ +struct task_ctx { + /* cpumask is the set of valid cpus this task can schedule on */ + /* (tasks cpumask anded with its cell cpumask) */ + struct bpf_cpumask __kptr *cpumask; + /* started_running_at for recording runtime */ + u64 started_running_at; + u64 basis_vtime; + /* For the sake of monitoring, each task is owned by a cell */ + u32 cell; + /* For the sake of scheduling, a task is exclusively owned by either a cell + * or a cpu */ + dsq_id_t dsq; + /* latest configuration that was applied for this task */ + /* (to know if it has to be re-applied) */ + u32 configuration_seq; + /* Is this task allowed on all cores of its cell? */ + bool all_cell_cpus_allowed; + // Which L3 this task is assigned to + s32 l3; + +#if MITOSIS_ENABLE_STEALING + /* When a task is stolen, dispatch() marks the destination L3 here. + * running() applies the retag and recomputes cpumask (vtime preserved). + */ + s32 pending_l3; + u32 steal_count; /* how many times this task has been stolen */ + u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ + u32 steals_prevented; /* how many times this task has been prevented from being stolen */ +#endif +}; + +// These could go in mitosis.bpf.h, but we'll cross that bridge when we get +static inline const struct cpumask *lookup_cell_cpumask(int idx); + +static inline struct task_ctx *lookup_task_ctx(struct task_struct *p); + +/* MAP TYPES */ +struct function_counters_map { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, NR_COUNTERS); +}; + +struct cell_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cell); + __uint(max_entries, MAX_CELLS); +}; + +struct rcu_read_guard { + bool active; +}; + +static inline struct rcu_read_guard rcu_read_lock_guard(void) +{ + bpf_rcu_read_lock(); + return (struct rcu_read_guard){ .active = true }; +} + +static inline void rcu_read_guard_release(struct rcu_read_guard *guard) +{ + if (guard->active) { + bpf_rcu_read_unlock(); + guard->active = false; + } +} +#define RCU_READ_GUARD() \ + struct rcu_read_guard __rcu_guard \ + __attribute__((__cleanup__(rcu_read_guard_release))) = \ + rcu_read_lock_guard() + +struct cpumask_guard { + struct bpf_cpumask *mask; +}; + +static inline struct cpumask_guard cpumask_create_guard(void) +{ + struct bpf_cpumask *mask = bpf_cpumask_create(); + return (struct cpumask_guard){ .mask = mask }; +} + +static inline void cpumask_guard_release(struct cpumask_guard *guard) +{ + if (guard->mask) { + bpf_cpumask_release(guard->mask); + guard->mask = NULL; + } +} + +#define CPUMASK_GUARD(var_name) \ + struct cpumask_guard var_name \ + __attribute__((__cleanup__(cpumask_guard_release))) = \ + cpumask_create_guard() diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index 
8f42568df6..b25be74326 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ b/scheds/rust/scx_mitosis/src/main.rs @@ -6,6 +6,7 @@ mod bpf_skel; pub use bpf_skel::*; pub mod bpf_intf; mod stats; +mod mitosis_topology_utils; use std::cmp::max; use std::collections::HashMap; @@ -16,14 +17,14 @@ use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; +use std::sync::Mutex; use anyhow::bail; use anyhow::Context; use anyhow::Result; use clap::Parser; use crossbeam::channel::RecvTimeoutError; -use libbpf_rs::MapCore as _; -use libbpf_rs::OpenObject; +use libbpf_rs::{MapCore, OpenObject, MapFlags}; use log::debug; use log::info; use log::trace; @@ -46,11 +47,48 @@ use scx_utils::NR_CPUS_POSSIBLE; use stats::CellMetrics; use stats::Metrics; +use crate::mitosis_topology_utils::{populate_topology_maps, MapKind}; + +// This is the cell type from intf.h. +// When copied to user, the lock field is omitted. +// We can mmap it, or use calls to the BPF_MAP_LOOKUP_ELEM +// command of the bpf() system call with the BPF_F_LOCK flag +type BpfCell = bpf_intf::cell; const SCHEDULER_NAME: &str = "scx_mitosis"; const MAX_CELLS: usize = bpf_intf::consts_MAX_CELLS as usize; const NR_CSTATS: usize = bpf_intf::cell_stat_idx_NR_CSTATS as usize; +// Can we deduplicate this with mitosis.bpf.h? +const CPUMASK_LONG_ENTRIES: usize = 128; + +// Global debug flags +// TODO: These will be runtime adjustable via a CLI option. +static DEBUG_FLAGS: std::sync::LazyLock>> = std::sync::LazyLock::new(|| { + let mut flags = HashMap::new(); + flags.insert("cpu_to_l3".to_string(), false); + flags.insert("l3_to_cpus".to_string(), false); + flags.insert("cells".to_string(), true ); + flags.insert("counters".to_string(), true ); + flags.insert("steals".to_string(), true ); + flags.insert("metrics".to_string(), true ); + Mutex::new(flags) +}); + +/// Debug Printers +const ANSI_RED: &str = "\x1b[31m"; +const ANSI_GREEN: &str = "\x1b[32m"; +const ANSI_RESET: &str = "\x1b[0m"; + +/// Check if a debug flag is enabled +fn is_debug_flag_enabled(flag: &str) -> bool { + if let Ok(flags) = DEBUG_FLAGS.lock() { + flags.get(flag).copied().unwrap_or(false) + } else { + false + } +} + /// scx_mitosis: A dynamic affinity scheduler /// /// Cgroups are assigned to a dynamic number of Cells which are assigned to a @@ -106,20 +144,22 @@ const QUEUE_STATS_IDX: [bpf_intf::cell_stat_idx; 3] = [ // Per cell book-keeping #[derive(Debug)] -struct Cell { +struct CellMask { cpus: Cpumask, } struct Scheduler<'a> { skel: BpfSkel<'a>, monitor_interval: Duration, - cells: HashMap, + cells: HashMap, // These are the per-cell cstats. // Note these are accumulated across all CPUs. prev_cell_stats: [[u64; NR_CSTATS]; MAX_CELLS], + prev_total_steals: u64, metrics: Metrics, stats_server: StatsServer<(), Metrics>, last_configuration_seq: Option, + iteration_count: u64, } struct DistributionStats { @@ -146,7 +186,7 @@ impl Display for DistributionStats { ); write!( f, - "{:width$} {:5.1}% | Local:{:4.1}% From: CPU:{:4.1}% Cell:{:4.1}% | V:{:4.1}%", + "{:width$} {:5.1}% | Local:{:5.1}% From: CPU:{:4.1}% Cell:{:5.1}% | V:{:4.1}%", self.total_decisions, self.share_of_decisions_pct, self.local_q_pct, @@ -159,6 +199,38 @@ impl Display for DistributionStats { } impl<'a> Scheduler<'a> { + fn get_bpf_cell(&self, cell_id: u32) -> anyhow::Result> { + let key = cell_id.to_ne_bytes(); + let map = &self.skel.maps.cells; // NOTE: map is a field, not a method + + match map.lookup(&key, MapFlags::ANY)? 
{ + Some(bytes) => { + let need = core::mem::size_of::(); + if bytes.len() != need { + anyhow::bail!("cells value size {} != BpfCell {}", bytes.len(), need); + } + // Copy to an aligned buffer to avoid misaligned reference + let mut tmp = MaybeUninit::::uninit(); + unsafe { + std::ptr::copy_nonoverlapping( + bytes.as_ptr(), + tmp.as_mut_ptr() as *mut u8, + need, + ); + Ok(Some(tmp.assume_init())) + } + } + None => Ok(None), + } + } + + fn is_cell_in_use(&self, cell_id: u32) -> bool { + match self.get_bpf_cell(cell_id) { + Ok(Some(c)) => c.in_use != 0, + _ => false, + } + } + fn init(opts: &Opts, open_object: &'a mut MaybeUninit) -> Result { let topology = Topology::new()?; @@ -182,12 +254,35 @@ impl<'a> Scheduler<'a> { skel.maps.rodata_data.as_mut().unwrap().all_cpus[cpu / 8] |= 1 << (cpu % 8); } + skel.maps.rodata_data.as_mut().unwrap().nr_l3 = topology.all_llcs.len() as u32; + + // print the number of l3s we detected + info!("Found {} L3s", topology.all_llcs.len()); + match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP { 0 => info!("Kernel does not support queued wakeup optimization."), v => skel.struct_ops.mitosis_mut().flags |= v, } - let skel = scx_ops_load!(skel, mitosis, uei)?; + let mut skel = scx_ops_load!(skel, mitosis, uei)?; + + // Verify our version of the cell datastructure is the same size + // as the bpf one. + let cells_info = skel.maps.cells.info()?; + let usz = core::mem::size_of::() as u32; + if cells_info.info.value_size != usz { + bail!( + "cells value_size={} but Rust expects {} (BpfCell)", + cells_info.info.value_size, + usz + ); + } + + // Set up CPU to L3 topology mapping using the common functionality + populate_topology_maps(&mut skel, MapKind::CpuToL3, None)?; + + // Set up L3 to CPUs mapping using the common functionality + populate_topology_maps(&mut skel, MapKind::L3ToCpus, None)?; let stats_server = StatsServer::new(stats::server_data()).launch()?; @@ -196,9 +291,11 @@ impl<'a> Scheduler<'a> { monitor_interval: Duration::from_secs(opts.monitor_interval_s), cells: HashMap::new(), prev_cell_stats: [[0; NR_CSTATS]; MAX_CELLS], + prev_total_steals: 0, metrics: Metrics::default(), stats_server, last_configuration_seq: None, + iteration_count: 0, }) } @@ -210,6 +307,7 @@ impl<'a> Scheduler<'a> { let (res_ch, req_ch) = self.stats_server.channels(); while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) { + self.iteration_count += 1; self.refresh_bpf_cells()?; self.collect_metrics()?; @@ -292,7 +390,7 @@ impl<'a> Scheduler<'a> { } } - let prefix = "Total Decisions:"; + let prefix = " Total: "; // Here we want to sum the affinity violations over all cells. let scope_affn_viols: u64 = cell_stats_delta @@ -310,7 +408,10 @@ impl<'a> Scheduler<'a> { self.metrics.update(&stats); - trace!("{} {}", prefix, stats); + if is_debug_flag_enabled("metrics") { + trace!("{}{}{}", ANSI_GREEN, "metrics:", ANSI_RESET); + trace!("{} {}", prefix, stats); + } Ok(()) } @@ -327,8 +428,8 @@ impl<'a> Scheduler<'a> { .map(|&stat| cell_stats_delta[cell][stat as usize]) .sum::(); - // FIXME: This should really query if the cell is enabled or not. 
- if cell_queue_decisions == 0 { + // Only print stats for cells that are in use and have decisions + if !self.is_cell_in_use(cell as u32) { continue; } @@ -340,7 +441,7 @@ impl<'a> Scheduler<'a> { const MIN_CELL_WIDTH: usize = 2; let cell_width: usize = max(MIN_CELL_WIDTH, (MAX_CELLS as f64).log10().ceil() as usize); - let prefix = format!(" Cell {:width$}:", cell, width = cell_width); + let prefix = format!(" Cell {:width$}:", cell, width = cell_width); // Sum affinity violations for this cell let scope_affn_viols: u64 = @@ -359,7 +460,9 @@ impl<'a> Scheduler<'a> { .or_default() .update(&stats); - trace!("{} {}", prefix, stats); + if is_debug_flag_enabled("metrics") { + trace!("{} {}", prefix, stats); + } } Ok(()) } @@ -410,17 +513,70 @@ impl<'a> Scheduler<'a> { } Ok(cell_stats_delta) } + /// Print debug printer status summary + fn print_debug_status(&self) { + if let Ok(flags) = DEBUG_FLAGS.lock() { + let mut disabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| (!enabled).then_some(format!("{}~{}{}", ANSI_RED, flag, ANSI_RESET))).collect(); + let enabled: Vec<_> = flags.iter().filter_map(|(flag, &enabled)| enabled.then_some(format!("{}+{}{}", ANSI_GREEN, flag, ANSI_RESET))).collect(); + disabled.extend(enabled); + trace!("Debug Flags: {}", if disabled.is_empty() { "none".to_string() } else { disabled.join(" ") }); + // trace!("hint: sudo ./scx_mitosis cli debug ~/+"); + } + } /// Collect metrics and out various debugging data like per cell stats, per-cpu stats, etc. fn collect_metrics(&mut self) -> Result<()> { + trace!(""); + trace!("Iteration #{}", self.iteration_count); + let cell_stats_delta = self.calculate_cell_stat_delta()?; self.log_all_queue_stats(&cell_stats_delta)?; + // TODO: I don't really understand this. for (cell_id, cell) in &self.cells { + // Check if cell is actually in use from BPF before printing + if !self.is_cell_in_use(*cell_id) { + continue; + } trace!("CELL[{}]: {}", cell_id, cell.cpus); } + // Read total steals from BPF and update metrics + self.update_steal_metrics()?; + + // Read and print function counters + self.print_and_reset_function_counters()?; + if is_debug_flag_enabled("cells") { + trace!("{}cells:{}", ANSI_GREEN, ANSI_RESET); + for i in 0..self.cells.len() { + if let Some(cell) = self.cells.get(&(i as u32)) { + trace!(" CELL[{}]: {} ({:3} CPUs)", i, cell.cpus, cell.cpus.weight()); + } + } + } + + if is_debug_flag_enabled("cpu_to_l3") { + let cpu_to_l3 = read_cpu_to_l3(&self.skel)?; + let cpu_l3_pairs: Vec = cpu_to_l3.iter().enumerate() + .map(|(cpu, l3)| format!("{:3}:{:2}", cpu, l3)) + .collect(); + let chunked_output = cpu_l3_pairs + .chunks(16) + .map(|chunk| chunk.join(" ")) + .collect::>() + .join("\n"); + trace!("{}cpu_to_l3:{}\n{}", ANSI_GREEN, ANSI_RESET, chunked_output); + } + + if is_debug_flag_enabled("l3_to_cpus") { + trace!("{}l3_to_cpus:{}", ANSI_GREEN, ANSI_RESET); + let l3_to_cpus = read_l3_to_cpus(&self.skel)?; + for (l3_id, mask) in l3_to_cpus.iter() { + trace!("l3_to_cpus: [{:2}] = {}", l3_id, mask); + } + } + for (cell_id, cell) in self.cells.iter() { // Assume we have a CellMetrics entry if we have a known cell self.metrics @@ -430,9 +586,167 @@ impl<'a> Scheduler<'a> { } self.metrics.num_cells = self.cells.len() as u32; + // Print debug printer status at the end of each cycle + self.print_debug_status(); + Ok(()) } + fn print_and_reset_function_counters(&mut self) -> Result<()> { + if !is_debug_flag_enabled("counters") { + return Ok(()); + } + trace!("{}counters:{}", ANSI_GREEN, ANSI_RESET); + + let counter_names = 
["select", "enqueue", "dispatch"]; + let max_name_len = counter_names.iter().map(|name| name.len()).max().unwrap_or(0); + let mut all_counters = Vec::new(); + + // Read counters for each function + for counter_idx in 0..bpf_intf::fn_counter_idx_NR_COUNTERS { + let key = (counter_idx as u32).to_ne_bytes(); + + // Read per-CPU values + let percpu_values = self.skel + .maps + .function_counters + .lookup_percpu(&key, MapFlags::ANY) + .context("Failed to lookup function counter")? + .unwrap_or_default(); + + let mut cpu_values = Vec::new(); + for cpu in 0..*NR_CPUS_POSSIBLE { + if cpu < percpu_values.len() { + let value = u64::from_ne_bytes( + percpu_values[cpu].as_slice().try_into() + .context("Failed to convert counter bytes")? + ); + cpu_values.push(value); + } + } + + all_counters.push(cpu_values); + } + + // Calculate and print statistics for each counter + for (idx, counter_values) in all_counters.iter().enumerate() { + if idx >= counter_names.len() { + break; + } + + let name = counter_names[idx]; + let non_zero_values: Vec = counter_values.iter().filter(|&&v| v > 0).copied().collect(); + + if non_zero_values.is_empty() { + trace!(" Fn[{:6} min={:>4} med={:>4} max={:>5} ({:3} CPUs)", + name, total, min, median, max, non_zero_values.len(), width = max_name_len + ); + } + + // Zero out all counters after printing + for counter_idx in 0..bpf_intf::fn_counter_idx_NR_COUNTERS { + let key = (counter_idx as u32).to_ne_bytes(); + let zero_value = 0u64.to_ne_bytes().to_vec(); + + // Create per-CPU values array (all zeros) + let percpu_values: Vec> = (0..*NR_CPUS_POSSIBLE) + .map(|_| zero_value.clone()) + .collect(); + + self.skel + .maps + .function_counters + .update_percpu(&key, &percpu_values, MapFlags::ANY) + .context("Failed to reset function counter")?; + } + + Ok(()) + } + +fn update_steal_metrics(&mut self) -> Result<()> { + let steals_debug = is_debug_flag_enabled("steals"); + + // Early out if stealing is compiled out. + if bpf_intf::MITOSIS_ENABLE_STEALING == 0 { + self.metrics.total_steals = 0; + if steals_debug { + trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET); + trace!(" Work stealing disabled at compile time (MITOSIS_ENABLE_STEALING=0)"); + } + return Ok(()); + } + + let key = 0u32.to_ne_bytes(); + + // Read the count; lazily initialize the slot to 0 if it doesn't exist. + let steal_count = match self.skel.maps.steal_stats.lookup(&key, MapFlags::ANY) { + Ok(Some(data)) if data.len() >= 8 => { + u64::from_ne_bytes(data[..8].try_into().unwrap()) + } + Ok(Some(_)) => { + if steals_debug { + debug!("steal_stats map data too small"); + } + 0 + } + Ok(None) => { + let zero = 0u64.to_ne_bytes(); + if let Err(e) = self.skel.maps.steal_stats.update(&key, &zero, MapFlags::ANY) { + if steals_debug { + debug!("Failed to initialize steal_stats map: {e}"); + } + } + 0 + } + Err(e) => { + if steals_debug { + debug!("Failed to read steal_stats map: {e}"); + } + 0 + } + }; + + // Calculate steals since last update (delta) + let steals_delta = steal_count - self.prev_total_steals; + self.prev_total_steals = steal_count; + self.metrics.total_steals = steals_delta; + + // Early out if we aren't logging. 
+        if !steals_debug {
+            return Ok(());
+        }
+
+        if steals_delta > 0 {
+            trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET);
+            trace!("  Work stealing active: steals_since_last={}", steals_delta);
+        } else {
+            trace!("{}steals:{}", ANSI_GREEN, ANSI_RESET);
+            trace!(
+                "  Work stealing enabled but no new steals: steals_since_last={}",
+                steals_delta
+            );
+        }
+
+        Ok(())
+    }
+
     fn refresh_bpf_cells(&mut self) -> Result<()> {
         let applied_configuration = unsafe {
             std::ptr::read_volatile(
@@ -465,15 +779,12 @@ impl<'a> Scheduler<'a> {
         // Create cells we don't have yet, drop cells that are no longer in use.
         // If we continue to drop cell metrics once a cell is removed, we'll need to make sure we
         // flush metrics for a cell before we remove it completely.
-        let cells = &self.skel.maps.bss_data.as_ref().unwrap().cells;
         for i in 0..MAX_CELLS {
             let cell_idx = i as u32;
-            let bpf_cell = cells[i];
-            let in_use = unsafe { std::ptr::read_volatile(&bpf_cell.in_use as *const u32) };
-            if in_use > 0 {
+            if self.is_cell_in_use(cell_idx) {
                 self.cells
                     .entry(cell_idx)
-                    .or_insert_with(|| Cell {
+                    .or_insert_with(|| CellMask {
                         cpus: Cpumask::new(),
                     })
                     .cpus = cell_to_cpus
@@ -498,7 +809,7 @@ fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
     let cpu_ctxs_vec = skel
         .maps
         .cpu_ctxs
-        .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
+        .lookup_percpu(&0u32.to_ne_bytes(), MapFlags::ANY)
         .context("Failed to lookup cpu_ctx")?
         .unwrap();
     for cpu in 0..*NR_CPUS_POSSIBLE {
@@ -509,7 +820,52 @@ fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
     Ok(cpu_ctxs)
 }
 
+fn read_cpu_to_l3(skel: &BpfSkel) -> Result<Vec<u32>> {
+    let mut cpu_to_l3 = vec![];
+    for cpu in 0..*NR_CPUS_POSSIBLE {
+        let key = (cpu as u32).to_ne_bytes();
+        let val = skel
+            .maps
+            .cpu_to_l3
+            .lookup(&key, MapFlags::ANY)?
+            .map(|v| u32::from_ne_bytes(v.try_into().unwrap()))
+            .unwrap_or(0);
+        cpu_to_l3.push(val);
+    }
+    Ok(cpu_to_l3)
+}
+
+fn read_l3_to_cpus(skel: &BpfSkel) -> Result<Vec<(u32, Cpumask)>> {
+    let mut l3_to_cpus = vec![];
+
+    // Get the number of L3 caches from the BPF rodata.
+    let nr_l3 = skel.maps.rodata_data.as_ref().unwrap().nr_l3;
+
+    for l3 in 0..nr_l3 {
+        let key = (l3 as u32).to_ne_bytes();
+        let mask = if let Some(v) = skel
+            .maps
+            .l3_to_cpus
+            .lookup(&key, MapFlags::ANY)?
+        {
+            let bytes = v.as_slice();
+            let mut longs = [0u64; CPUMASK_LONG_ENTRIES];
+            let mut i = 0;
+            while i < CPUMASK_LONG_ENTRIES && i * 8 + 8 <= bytes.len() {
+                longs[i] = u64::from_ne_bytes(bytes[i * 8..i * 8 + 8].try_into().unwrap());
+                i += 1;
+            }
+            Cpumask::from_vec(longs.to_vec())
+        } else {
+            Cpumask::new()
+        };
+        l3_to_cpus.push((l3, mask));
+    }
+    Ok(l3_to_cpus)
+}
+
 fn main() -> Result<()> {
+
     let opts = Opts::parse();
 
     if opts.version {
diff --git a/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs b/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs
new file mode 100644
index 0000000000..de19b1e02d
--- /dev/null
+++ b/scheds/rust/scx_mitosis/src/mitosis_topology_utils.rs
@@ -0,0 +1,168 @@
+use anyhow::{bail, Context, Result};
+use libbpf_rs::{MapCore, MapFlags};
+use scx_utils::Topology;
+use std::collections::HashMap;
+use std::io::{self, BufRead, BufReader};
+use std::path::Path;
+
+use crate::bpf_skel::BpfSkel;
+
+const CPUMASK_LONG_ENTRIES: usize = 128;
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MapKind {
+    CpuToL3,
+    L3ToCpus,
+}
+
+impl std::str::FromStr for MapKind {
+    type Err = anyhow::Error;
+    fn from_str(s: &str) -> Result<Self> {
+        match s {
+            "cpu_to_l3" => Ok(MapKind::CpuToL3),
+            "l3_to_cpus" => Ok(MapKind::L3ToCpus),
+            _ => bail!("unknown map {s}"),
+        }
+    }
+}
+
+impl std::fmt::Display for MapKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            MapKind::CpuToL3 => "cpu_to_l3",
+            MapKind::L3ToCpus => "l3_to_cpus",
+        })
+    }
+}
+
+pub const SUPPORTED_MAPS: &[MapKind] = &[MapKind::CpuToL3, MapKind::L3ToCpus];
+
+/// Parse lines of the form `cpu,l3` from the provided reader.
+fn parse_cpu_l3_map<R: BufRead>(reader: R) -> Result<Vec<(usize, usize)>> {
+    let mut pairs = Vec::new();
+    for line in reader.lines() {
+        let line = line?;
+        let line = line.trim();
+        // Ignore blank lines and comments
+        if line.is_empty() || line.starts_with('#') {
+            continue;
+        }
+        let mut parts = line.split(',');
+        let cpu = parts
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("missing cpu"))?
+            .trim()
+            .parse::<usize>()?;
+        let l3 = parts
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("missing l3"))?
+            .trim()
+            .parse::<usize>()?;
+        pairs.push((cpu, l3));
+    }
+    Ok(pairs)
+}
+
+/// Read CPU/L3 pairs either from a file or from standard input.
+fn read_cpu_l3_map(path: &str) -> Result<Vec<(usize, usize)>> {
+    if path == "-" {
+        println!("reading from stdin");
+        let stdin = io::stdin();
+        let reader = BufReader::new(stdin.lock());
+        parse_cpu_l3_map(reader)
+    } else {
+        println!("reading from {path}");
+        let file = std::fs::File::open(Path::new(path))?;
+        let reader = BufReader::new(file);
+        parse_cpu_l3_map(reader)
+    }
+}
+
+/// Update map entries either from a file or from the host topology.
+/// This function can be used by both the main scheduler and CLI tools.
+pub fn populate_topology_maps(skel: &mut BpfSkel, map: MapKind, file: Option<String>) -> Result<()> {
+    match map {
+        MapKind::CpuToL3 => {
+            let map_entries = if let Some(path) = file {
+                println!("loading from {path}");
+                read_cpu_l3_map(&path)?
+            } else {
+                println!("loading from host topology");
+                let topo = Topology::new()?;
+                (0..*scx_utils::NR_CPUS_POSSIBLE)
+                    // Use 0 if a CPU is missing from the topology
+                    .map(|cpu| (cpu, topo.all_cpus.get(&cpu).map(|c| c.l3_id).unwrap_or(0)))
+                    .collect()
+            };
+            for (cpu, l3) in map_entries {
+                // Each CPU index is stored as a 32-bit key mapping to its L3 id.
+                let key = (cpu as u32).to_ne_bytes();
+                let val = (l3 as u32).to_ne_bytes();
+                skel.maps.cpu_to_l3.update(&key, &val, MapFlags::ANY)?;
+            }
+        }
+        MapKind::L3ToCpus => {
+            if file.is_some() {
+                anyhow::bail!("Loading l3_to_cpus from file is not supported yet");
+            }
+
+            println!("loading l3_to_cpus from host topology");
+            let topo = Topology::new()?;
+
+            // Group CPUs by L3 cache ID.
+            let mut l3_to_cpus: HashMap<usize, Vec<usize>> = HashMap::new();
+            for cpu in topo.all_cpus.values() {
+                l3_to_cpus.entry(cpu.l3_id).or_default().push(cpu.id);
+            }
+
+            // For each L3 cache, create a cpumask and populate the map.
+            for (l3_id, cpus) in l3_to_cpus {
+                let key = (l3_id as u32).to_ne_bytes();
+
+                // Create a cpumask structure that matches the BPF side.
+                let mut cpumask_longs = [0u64; CPUMASK_LONG_ENTRIES];
+
+                // Set a bit for each CPU in this L3 cache.
+                for cpu in cpus {
+                    let long_idx = cpu / 64;
+                    let bit_idx = cpu % 64;
+                    if long_idx < CPUMASK_LONG_ENTRIES {
+                        cpumask_longs[long_idx] |= 1u64 << bit_idx;
+                    }
+                }
+
+                // Convert to bytes for the map update.
+                let mut value_bytes = Vec::new();
+                for long_val in cpumask_longs {
+                    value_bytes.extend_from_slice(&long_val.to_ne_bytes());
+                }
+
+                skel.maps
+                    .l3_to_cpus
+                    .update(&key, &value_bytes, MapFlags::ANY)
+                    .context(format!("Failed to update l3_to_cpus map for L3 {}", l3_id))?;
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Display CPU to L3 cache relationships discovered from the host topology.
+pub fn print_topology() -> Result<()> {
+    let topo = Topology::new()?;
+    println!("Number of L3 caches: {}", topo.all_llcs.len());
+    println!("CPU -> L3 id:");
+    for cpu in topo.all_cpus.values() {
+        println!("cpu {} -> {}", cpu.id, cpu.l3_id);
+    }
+    println!("\nL3 id -> [cpus]:");
+    let mut by_l3: std::collections::BTreeMap<usize, Vec<usize>> =
+        std::collections::BTreeMap::new();
+    for cpu in topo.all_cpus.values() {
+        by_l3.entry(cpu.l3_id).or_default().push(cpu.id);
+    }
+    for (l3, mut cpus) in by_l3 {
+        cpus.sort_unstable();
+        println!("{l3} -> {:?}", cpus);
+    }
+    Ok(())
+}
diff --git a/scheds/rust/scx_mitosis/src/stats.rs b/scheds/rust/scx_mitosis/src/stats.rs
index 749296a4ff..0cfc001667 100644
--- a/scheds/rust/scx_mitosis/src/stats.rs
+++ b/scheds/rust/scx_mitosis/src/stats.rs
@@ -65,6 +65,8 @@ pub struct Metrics {
     pub share_of_decisions_pct: f64,
     #[stat(desc = "Cell scheduling decisions")]
     total_decisions: u64,
+    #[stat(desc = "Work steals since last update")]
+    pub total_steals: u64,
     #[stat(desc = "Per-cell metrics")]
     // TODO: cell names
     pub cells: BTreeMap<u32, CellMetrics>,
 }
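A minimal sketch (not part of the patch above) of how the `cpu,l3` input format accepted by `read_cpu_l3_map` could be exercised: one `cpu,l3` pair per line, with blank lines and `#` comments ignored. It assumes the `parse_cpu_l3_map` signature shown above (`Result<Vec<(usize, usize)>>`) and would live in a `#[cfg(test)]` module inside `mitosis_topology_utils.rs`.

// Hypothetical test module; assumes parse_cpu_l3_map<R: BufRead>(reader: R)
// returns Result<Vec<(usize, usize)>> as reconstructed above.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn parses_cpu_l3_pairs_and_skips_comments() {
        // Comment and blank lines are ignored; each remaining line is "cpu,l3".
        let input = "# cpu,l3\n0,0\n1,0\n\n2,1\n3,1\n";
        let pairs = parse_cpu_l3_map(Cursor::new(input)).unwrap();
        assert_eq!(pairs, vec![(0, 0), (1, 0), (2, 1), (3, 1)]);
    }
}

The same text can be fed to the CLI path by passing a file (or `-` for stdin) as the `file` argument of `populate_topology_maps(..., MapKind::CpuToL3, Some(path))`.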