174 changes: 174 additions & 0 deletions scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h
@@ -0,0 +1,174 @@
/* Copyright (c) Meta Platforms, Inc. and affiliates. */
/*
* This software may be used and distributed according to the terms of the
* GNU General Public License version 2.
*
* This header defines the 64-bit dispatch queue (DSQ) ID encoding
* scheme for scx_mitosis, using type fields to distinguish between
* per-CPU and cell+L3 domain queues. It includes helper functions to
* construct, validate, and parse these DSQ IDs for queue management.
*/
#pragma once

#include "intf.h"
#include "mitosis.bpf.h"

/*
* ================================
* BPF DSQ ID Layout (64 bits wide)
* ================================
*
* Top-level format:
* [63] [62..0]
* [ B] [ ID ]
*
* If B == 1 it is a Built-in DSQ
* -------------------------
* [63] [62] [61 .. 32] [31..0]
* [ 1] [ L] [ R ] [ V ]
*
* - L (bit 62): LOCAL_ON flag
* If L == 1 -> V = CPU number
* - R (30 bits): reserved / unused
* - V (32 bits): value (e.g., CPU#)
*
* If B == 0 -> User-defined DSQ
* -----------------------------
* Only the low 32 bits are used.
*
 * [63] [62 .. 32] [31..0]
 * [ 0] [ unused ] [ VAL ]
*
* Mitosis uses VAL as follows:
*
* [31..28] [27..0]
* [QTYPE ] [DATA ]
*
* QTYPE encodes the queue type:
*
* QTYPE = 0x1 -> Per-CPU Q
* [31..28] [27 .. .. 0]
* [ 0001 ] [ CPU# ]
* [Q-TYPE:1]
*
* QTYPE = 0x2 -> Cell+L3 Q
* [31..28] [27 .. 16] [15 .. 0]
* [ 0010 ] [ CELL# ] [ L3ID ]
* [Q-TYPE:2]
*
*/
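/*
 * Worked examples (illustrative only, derived from the layout above):
 *   Per-CPU DSQ for CPU 5:         0x10000005 (QTYPE=0x1, CPU# = 5)
 *   Cell+L3 DSQ for cell 3, L3 1:  0x20030001 (QTYPE=0x2, CELL# = 3, L3ID = 1)
 */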
/*
 * The use of these bitfields depends on compiler-defined byte and bit ordering.
 * Make sure we're only building with Clang/LLVM and targeting little-endian.
 */
#ifndef __clang__
#error "This code must be compiled with Clang/LLVM (eBPF: clang -target bpf)."
#endif

#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
#error "dsq64 bitfield layout assumes little-endian (bpfel)."
#endif

/* ---- Bitfield widths (bits) ---- */
#define CPU_B 28
#define L3_B 16
#define CELL_B 12
#define TYPE_B 4
#define DATA_B 28
#define RSVD_B 32

/* Sum checks (in bits) */
_Static_assert(CPU_B + TYPE_B == 32, "CPU layout low half must be 32 bits");
_Static_assert(L3_B + CELL_B + TYPE_B == 32, "CELL+L3 layout low half must be 32 bits");
_Static_assert(DATA_B + TYPE_B == 32, "Common layout low half must be 32 bits");

typedef union {
u64 raw;

/* Per-CPU user DSQ */
struct { u64 cpu: CPU_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cpu_dsq;

/* Cell+L3 user DSQ */
struct { u64 l3: L3_B; u64 cell: CELL_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cell_l3_dsq;

/* Generic user view */
struct { u64 data: DATA_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } user_dsq;

/* Built-in DSQ view */
struct { u64 value:32; u64 rsvd:30; u64 local_on:1; u64 builtin:1; } builtin_dsq;

/* NOTE: packed/aligned attributes were considered, but they are redundant: the union is naturally 8 bytes with 8-byte alignment (asserted below). */
} dsq_id_t;

/*
 * Invalid DSQ ID sentinel:
 * invalid because bit 63 is clear (a user DSQ) and the type field is
 * DSQ_TYPE_NONE (no type). Useful for catching uninitialized DSQ IDs.
 */
#define DSQ_INVALID ((u64) 0)

_Static_assert(sizeof(((dsq_id_t){0}).cpu_dsq) == sizeof(u64), "cpu view must be 8 bytes");
_Static_assert(sizeof(((dsq_id_t){0}).cell_l3_dsq) == sizeof(u64), "cell+l3 view must be 8 bytes");
_Static_assert(sizeof(((dsq_id_t){0}).user_dsq) == sizeof(u64), "user common view must be 8 bytes");
_Static_assert(sizeof(((dsq_id_t){0}).builtin_dsq) == sizeof(u64), "builtin view must be 8 bytes");

/* Compile-time checks (in bytes) */
_Static_assert(sizeof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8 bytes (64 bits)");
_Static_assert(_Alignof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8-byte aligned");

/* DSQ type enumeration */
enum dsq_type {
DSQ_TYPE_NONE,
DSQ_TYPE_CPU,
DSQ_TYPE_CELL_L3,
};

/* Range guards */
_Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field");
_Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field");
_Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field");
_Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), "DSQ_TYPE_CELL_L3 must fit in field");

/*
 * Error propagation was considered, but these helpers call scx_bpf_error()
 * instead so that mistakes surface as early as possible.
 */

static inline bool is_user_dsq(dsq_id_t dsq_id)
{
return !dsq_id.builtin_dsq.builtin && dsq_id.user_dsq.type != DSQ_TYPE_NONE;
}

// Is this a per-CPU DSQ?
static inline bool is_cpu_dsq(dsq_id_t dsq_id)
{
return is_user_dsq(dsq_id) && dsq_id.user_dsq.type == DSQ_TYPE_CPU;
}

// If this is a per-CPU DSQ, return its CPU; otherwise raise a scheduler error.
static inline u32 get_cpu_from_dsq(u64 id)
{
dsq_id_t dsq_id = (dsq_id_t) {.raw = id};
if (!is_cpu_dsq(dsq_id))
scx_bpf_error("trying to get cpu from non-cpu dsq\n");

return dsq_id.cpu_dsq.cpu;
}

/* Helper functions to construct DSQ IDs */
static inline u64 get_cpu_dsq_id(u32 cpu)
{
// Validate the CPU index; CPUs are 0-indexed, hence >=.
if (cpu >= MAX_CPUS)
scx_bpf_error("invalid cpu %u\n", cpu);
dsq_id_t dsq_id = { .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } };

return dsq_id.raw;
}

static inline u64 get_cell_l3_dsq_id(u32 cell, u32 l3)
{
if (cell >= MAX_CELLS || l3 >= MAX_L3S)
scx_bpf_error("cell %u or l3 %u too large\n", cell, l3);
dsq_id_t dsq_id = { .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } };

return dsq_id.raw;
}
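For symmetry with get_cpu_from_dsq(), a decode helper for cell+L3 DSQs could look like the sketch below (illustrative only, not part of this patch; the name get_cell_from_dsq is hypothetical and simply reuses the union views defined above, with a matching get_l3_from_dsq following the same pattern):

static inline u32 get_cell_from_dsq(u64 id)
{
	dsq_id_t dsq_id = (dsq_id_t) {.raw = id};

	if (!is_user_dsq(dsq_id) || dsq_id.user_dsq.type != DSQ_TYPE_CELL_L3)
		scx_bpf_error("trying to get cell from non-cell+l3 dsq\n");

	return dsq_id.cell_l3_dsq.cell;
}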
24 changes: 14 additions & 10 deletions scheds/rust/scx_mitosis/src/bpf/intf.h
@@ -18,6 +18,12 @@ typedef _Bool bool;
#include <scx/ravg.bpf.h>
#endif

/* ---- Work stealing config (compile-time) ------------------------------- */
Contributor:
I think it might be best to have this as a runtime option - e.g. a flag passed to the user space binary that writes to a global static variable in the bpf code before running it
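One way to realize that suggestion (a sketch under the assumption that the userspace loader patches BPF read-only data before attaching; the names here are hypothetical, not part of this patch):

/* BPF side: read-only knob set by the loader via the skeleton's rodata. */
const volatile bool enable_stealing = true;

static inline bool stealing_enabled(void)
{
	return enable_stealing;
}

The userspace binary would then expose a CLI flag and write the value into the generated skeleton's rodata before loading, which is the usual pattern for const volatile globals in sched_ext schedulers.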

#ifndef MITOSIS_ENABLE_STEALING
#define MITOSIS_ENABLE_STEALING 1
#endif
/* ----------------------------------------------------------------------- */

enum consts {
CACHELINE_SIZE = 64,
MAX_CPUS_SHIFT = 9,
@@ -39,6 +45,14 @@ enum cell_stat_idx {
NR_CSTATS,
};

/* Function invocation counters */
enum counter_idx {
COUNTER_SELECT_CPU,
COUNTER_ENQUEUE,
COUNTER_DISPATCH,
NR_COUNTERS,
};
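A typical consumer of these indices would be a small per-CPU counter map and an increment helper, sketched below (illustrative only; the map name "counters" is hypothetical and the backing map is not part of this hunk):

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, u64);
	__uint(max_entries, NR_COUNTERS);
} counters SEC(".maps");

static inline void count_invocation(u32 idx)
{
	u64 *cnt = bpf_map_lookup_elem(&counters, &idx);

	if (cnt)
		(*cnt)++;
}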

struct cpu_ctx {
u64 cstats[MAX_CELLS][NR_CSTATS];
u64 cell_cycles[MAX_CELLS];
@@ -51,14 +65,4 @@ struct cgrp_ctx {
bool cell_owner;
};

/*
* cell is the per-cell book-keeping
*/
struct cell {
// current vtime of the cell
u64 vtime_now;
// Whether or not the cell is used or not
u32 in_use;
};

#endif /* __INTF_H */
150 changes: 150 additions & 0 deletions scheds/rust/scx_mitosis/src/bpf/l3_aware.bpf.h
@@ -0,0 +1,150 @@
/* Copyright (c) Meta Platforms, Inc. and affiliates. */
/*
* This software may be used and distributed according to the terms of the
* GNU General Public License version 2.
*
* This header adds L3 cache awareness to scx_mitosis by defining BPF
* maps for CPU-to-L3 domain mappings. It provides functions to
* recalculate per-L3 CPU counts within cells and implements weighted
* random L3 selection for tasks. It also tracks work-stealing
* statistics for cross-L3 task migrations.
*/
#pragma once

#include "mitosis.bpf.h"
#include "intf.h"

// Alternatively, this mask could be computed from the cpu_to_l3 map.
struct l3_cpu_mask {
unsigned long cpumask[CPUMASK_LONG_ENTRIES];
};

/* Work stealing statistics map - accessible from both BPF and userspace */
struct steal_stats_map {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, 1);
};

// A CPU -> L3 cache ID map
struct cpu_to_l3_map {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u32);
__uint(max_entries, MAX_CPUS);
};

struct l3_to_cpus_map {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct l3_cpu_mask);
__uint(max_entries, MAX_L3S);
};

extern struct cpu_to_l3_map cpu_to_l3 SEC(".maps");
extern struct l3_to_cpus_map l3_to_cpus SEC(".maps");
extern struct steal_stats_map steal_stats SEC(".maps");

static inline const struct cpumask *lookup_l3_cpumask(u32 l3)
{
struct l3_cpu_mask *mask;

if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) {
scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus);
return NULL;
}

return (const struct cpumask *)mask;
}
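A companion lookup for the cpu_to_l3 map, mirroring lookup_l3_cpumask() above, might look like this sketch (illustrative only, not part of this patch; the helper name is hypothetical):

static inline u32 lookup_cpu_l3(u32 cpu)
{
	u32 *l3;

	if (!(l3 = bpf_map_lookup_elem(&cpu_to_l3, &cpu))) {
		scx_bpf_error("no l3 for cpu %u", cpu);
		return 0;
	}

	return *l3;
}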

/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes (no persistent kptrs). */
static __always_inline void recalc_cell_l3_counts(u32 cell_idx)
{
struct cell *cell = lookup_cell(cell_idx);
if (!cell)
return;

struct bpf_cpumask *tmp = bpf_cpumask_create();
if (!tmp)
return;

u32 l3, present = 0, total_cpus = 0;

bpf_rcu_read_lock();
const struct cpumask *cell_mask =
lookup_cell_cpumask(cell_idx); // RCU ptr
if (!cell_mask) {
bpf_rcu_read_unlock();
bpf_cpumask_release(tmp);
return;
}

bpf_for(l3, 0, nr_l3)
{
const struct cpumask *l3_mask =
lookup_l3_cpumask(l3); // plain map memory
if (!l3_mask) {
cell->l3_cpu_cnt[l3] = 0;
continue;
}

/* ok: dst is bpf_cpumask*, sources are (RCU cpumask*, plain cpumask*) */
bpf_cpumask_and(tmp, cell_mask, l3_mask);

u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp);
cell->l3_cpu_cnt[l3] = cnt;
total_cpus += cnt;
if (cnt)
present++;
}
bpf_rcu_read_unlock();

cell->l3_present_cnt = present;
cell->cpu_cnt = total_cpus;
bpf_cpumask_release(tmp);
}

/**
* Weighted random selection of an L3 cache domain for a task.
*
* Uses the CPU count in each L3 domain within the cell as weights to
* probabilistically select an L3. L3 domains with more CPUs in the cell
* have higher probability of being selected.
*
* @cell_id: The cell ID to select an L3 from
 * @return: L3 ID on success, INVALID_L3_ID on error
*/
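/*
 * Worked example (illustrative): for a cell with l3_cpu_cnt = {4, 2, 0, 2}
 * and cpu_cnt = 8, target is drawn from [0, 8): values 0-3 select L3 0,
 * 4-5 select L3 1, and 6-7 select L3 3 (L3 2 has weight 0 and is never picked).
 */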
static inline s32 pick_l3_for_task(u32 cell_id)
{
struct cell *cell;
u32 l3, target, cur = 0;
s32 ret = INVALID_L3_ID;

/* Look up the cell structure */
if (!(cell = lookup_cell(cell_id)))
return INVALID_L3_ID;

/* Handle case where cell has no CPUs assigned yet */
if (!cell->cpu_cnt) {
scx_bpf_error(
"pick_l3_for_task: cell %d has no CPUs accounted yet",
cell_id);
return INVALID_L3_ID;
}

/* Generate random target value in range [0, cpu_cnt) */
target = bpf_get_prandom_u32() % cell->cpu_cnt;

/* Find the L3 domain corresponding to the target value using
* weighted selection - accumulate CPU counts until we exceed target */
bpf_for(l3, 0, nr_l3)
{
cur += cell->l3_cpu_cnt[l3];
if (target < cur) {
ret = (s32)l3;
break;
}
}
return ret;
}