-
Notifications
You must be signed in to change notification settings - Fork 198
scx_mitosis: add l3 awareness and work stealing #2761
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
tommy-u
wants to merge
12
commits into
sched-ext:main
Choose a base branch
from
tommy-u:scx_mitosis_l3_aware
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from 5 commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
1b11d2b
Only print stats when cell is in_use
tommy-u 0b7a1fc
Fix cpumask cleanup. RAII for running guard.
tommy-u 0c3c7bb
Preparing datastructures and helper functions for core scheduler modi…
tommy-u ba1924b
Prepare rust side for l3 awareness
tommy-u 0dd6be6
scx_mitosis: add L3 awareness and work stealing
tommy-u 8523b9d
scx_mitosis: major work stealing cleanup
tommy-u 7ddaba0
Use dsq_id_t type
tommy-u 7639d21
First cut at locking
tommy-u 7972846
Lock cell state
tommy-u 73a8623
Clang format
tommy-u d0a7eed
remove accidental code file
tommy-u 0b126e9
Fix work stealing bug
tommy-u File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,174 @@ | ||
| /* Copyright (c) Meta Platforms, Inc. and affiliates. */ | ||
| /* | ||
| * This software may be used and distributed according to the terms of the | ||
| * GNU General Public License version 2. | ||
| * | ||
| * This header defines the 64-bit dispatch queue (DSQ) ID encoding | ||
| * scheme for scx_mitosis, using type fields to distinguish between | ||
| * per-CPU and cell+L3 domain queues. It includes helper functions to | ||
| * construct, validate, and parse these DSQ IDs for queue management. | ||
| */ | ||
| #pragma once | ||
|
|
||
| #include "intf.h" | ||
| #include "mitosis.bpf.h" | ||
|
|
||
| /* | ||
| * ================================ | ||
| * BPF DSQ ID Layout (64 bits wide) | ||
| * ================================ | ||
| * | ||
| * Top-level format: | ||
| * [63] [62..0] | ||
| * [ B] [ ID ] | ||
| * | ||
| * If B == 1 it is a Built-in DSQ | ||
| * ------------------------- | ||
| * [63] [62] [61 .. 32] [31..0] | ||
| * [ 1] [ L] [ R ] [ V ] | ||
| * | ||
| * - L (bit 62): LOCAL_ON flag | ||
| * If L == 1 -> V = CPU number | ||
| * - R (30 bits): reserved / unused | ||
| * - V (32 bits): value (e.g., CPU#) | ||
| * | ||
| * If B == 0 -> User-defined DSQ | ||
| * ----------------------------- | ||
| * Only the low 32 bits are used. | ||
| * | ||
| * [63 .. 32] [31..0] | ||
| * [ 0][ unused ] [ VAL ] | ||
| * | ||
| * Mitosis uses VAL as follows: | ||
| * | ||
| * [31..28] [27..0] | ||
| * [QTYPE ] [DATA ] | ||
| * | ||
| * QTYPE encodes the queue type: | ||
| * | ||
| * QTYPE = 0x1 -> Per-CPU Q | ||
| * [31..28] [27 .. .. 0] | ||
| * [ 0001 ] [ CPU# ] | ||
| * [Q-TYPE:1] | ||
| * | ||
| * QTYPE = 0x2 -> Cell+L3 Q | ||
| * [31..28] [27 .. 16] [15 .. 0] | ||
| * [ 0010 ] [ CELL# ] [ L3ID ] | ||
| * [Q-TYPE:2] | ||
| * | ||
| */ | ||
| /* | ||
| * The use of these bitfields depends on compiler defined byte AND bit ordering. | ||
| * Make sure we're only building with Clang/LLVM and that we're little-endian. | ||
| */ | ||
| #ifndef __clang__ | ||
| #error "This code must be compiled with Clang/LLVM (eBPF: clang -target bpf)." | ||
| #endif | ||
|
|
||
| #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ | ||
| #error "dsq64 bitfield layout assumes little-endian (bpfel)." | ||
| #endif | ||
|
|
||
/*
 * Widths, in bits, of the fields packed into a DSQ ID.  TYPE_B plus the
 * payload field(s) of each user view must fill the 32-bit low half.
 */
#define TYPE_B 4	/* queue-type selector present in every user view */
#define DATA_B 28	/* untyped payload of the generic user view */
#define CPU_B  28	/* CPU number in the per-CPU view */
#define CELL_B 12	/* cell number in the cell+L3 view */
#define L3_B   16	/* L3 id in the cell+L3 view */
#define RSVD_B 32	/* upper half, reserved in the user views */

/* Every user view has to cover the low 32 bits exactly. */
_Static_assert(TYPE_B + CPU_B == 32, "CPU layout low half must be 32 bits");
_Static_assert(TYPE_B + CELL_B + L3_B == 32, "CELL+L3 layout low half must be 32 bits");
_Static_assert(TYPE_B + DATA_B == 32, "Common layout low half must be 32 bits");
|
|
||
| typedef union { | ||
| u64 raw; | ||
|
|
||
| /* Per-CPU user DSQ */ | ||
| struct { u64 cpu: CPU_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cpu_dsq; | ||
|
|
||
| /* Cell+L3 user DSQ */ | ||
| struct { u64 l3: L3_B; u64 cell: CELL_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } cell_l3_dsq; | ||
|
|
||
| /* Generic user view */ | ||
| struct { u64 data: DATA_B; u64 type: TYPE_B; u64 rsvd: RSVD_B; } user_dsq; | ||
|
|
||
| /* Built-in DSQ view */ | ||
| struct { u64 value:32; u64 rsvd:30; u64 local_on:1; u64 builtin:1; } builtin_dsq; | ||
|
|
||
| /* NOTE: Considered packed and aligned attributes, but that's redundant */ | ||
| } dsq_id_t; | ||
|
|
||
| /* | ||
| * Invalid DSQ ID Sentinel: | ||
| * invalid because bit 63 is clear (so it is a user DSQ) and dsq_type == 0 (no type) | ||
| * Good for catching uninitialized DSQ IDs. | ||
| */ | ||
| #define DSQ_INVALID ((u64) 0) | ||
|
|
||
| _Static_assert(sizeof(((dsq_id_t){0}).cpu_dsq) == sizeof(u64), "cpu view must be 8 bytes"); | ||
| _Static_assert(sizeof(((dsq_id_t){0}).cell_l3_dsq) == sizeof(u64), "cell+l3 view must be 8 bytes"); | ||
| _Static_assert(sizeof(((dsq_id_t){0}).user_dsq) == sizeof(u64), "user common view must be 8 bytes"); | ||
| _Static_assert(sizeof(((dsq_id_t){0}).builtin_dsq) == sizeof(u64), "builtin view must be 8 bytes"); | ||
|
|
||
| /* Compile-time checks (in bytes) */ | ||
| _Static_assert(sizeof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8 bytes (64 bits)"); | ||
| _Static_assert(_Alignof(dsq_id_t) == sizeof(u64), "dsq_id_t must be 8-byte aligned"); | ||
|
|
||
/*
 * Values carried in the `type` field of a user DSQ ID.  The numbers are
 * part of the ID encoding, so they are written out explicitly.
 */
enum dsq_type {
	DSQ_TYPE_NONE    = 0,	/* no type set */
	DSQ_TYPE_CPU     = 1,	/* per-CPU queue */
	DSQ_TYPE_CELL_L3 = 2,	/* cell+L3 queue */
};
|
|
||
| /* Range guards */ | ||
| _Static_assert(MAX_CPUS <= (1u << CPU_B), "MAX_CPUS must fit in field"); | ||
| _Static_assert(MAX_L3S <= (1u << L3_B), "MAX_L3S must fit in field"); | ||
| _Static_assert(MAX_CELLS <= (1u << CELL_B), "MAX_CELLS must fit in field"); | ||
| _Static_assert(DSQ_TYPE_CELL_L3 < (1u << TYPE_B), "DSQ_TYPE_CELL_L3 must fit in field"); | ||
|
|
||
| /* | ||
| * While I considered error propagation, I decided to bail to force errors early. | ||
| */ | ||
|
|
||
| static inline bool is_user_dsq(dsq_id_t dsq_id){ | ||
| return !dsq_id.builtin_dsq.builtin && dsq_id.user_dsq.type != DSQ_TYPE_NONE; | ||
| } | ||
|
|
||
| // Is this a per CPU DSQ? | ||
| static inline bool is_cpu_dsq(dsq_id_t dsq_id) | ||
| { | ||
| return is_user_dsq(dsq_id) && dsq_id.user_dsq.type == DSQ_TYPE_CPU; | ||
| } | ||
|
|
||
| // If this is a per cpu dsq, return the cpu | ||
| static inline u32 get_cpu_from_dsq(u64 id) | ||
| { | ||
| dsq_id_t dsq_id = (dsq_id_t) {.raw = id}; | ||
| if (!is_cpu_dsq(dsq_id)) | ||
| scx_bpf_error("trying to get cpu from non-cpu dsq\n"); | ||
|
|
||
| return dsq_id.cpu_dsq.cpu; | ||
| } | ||
|
|
||
| /* Helper functions to construct DSQ IDs */ | ||
| static inline u64 get_cpu_dsq_id(u32 cpu) | ||
| { | ||
| // Check for valid CPU range, 0 indexed so >=. | ||
| if (cpu >= MAX_CPUS) | ||
| scx_bpf_error("invalid cpu %u\n", cpu); | ||
| dsq_id_t dsq_id = { .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; | ||
|
|
||
| return dsq_id.raw; | ||
| } | ||
|
|
||
| static inline u64 get_cell_l3_dsq_id(u32 cell, u32 l3) | ||
| { | ||
| if (cell >= MAX_CELLS || l3 >= MAX_L3S) | ||
| scx_bpf_error("cell %u or l3 %u too large\n", cell, l3); | ||
| dsq_id_t dsq_id = { .cell_l3_dsq = { .l3 = l3, .cell = cell, .type = DSQ_TYPE_CELL_L3 } }; | ||
|
|
||
| return dsq_id.raw; | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,12 @@ typedef _Bool bool; | |
| #include <scx/ravg.bpf.h> | ||
| #endif | ||
|
|
||
| /* ---- Work stealing config (compile-time) ------------------------------- */ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it might be best to have this as a runtime option - e.g. a flag passed to the user space binary that writes to a global static variable in the bpf code before running it |
||
| #ifndef MITOSIS_ENABLE_STEALING | ||
| #define MITOSIS_ENABLE_STEALING 1 | ||
| #endif | ||
| /* ----------------------------------------------------------------------- */ | ||
|
|
||
| enum consts { | ||
| CACHELINE_SIZE = 64, | ||
| MAX_CPUS_SHIFT = 9, | ||
|
|
@@ -39,6 +45,14 @@ enum cell_stat_idx { | |
| NR_CSTATS, | ||
| }; | ||
|
|
||
/*
 * Function invocation counters.
 *
 * Each enumerator is the index of one counter; NR_COUNTERS is the number of
 * counters and therefore has to stay last.
 */
enum counter_idx {
	COUNTER_SELECT_CPU = 0,
	COUNTER_ENQUEUE    = 1,
	COUNTER_DISPATCH   = 2,
	NR_COUNTERS,
};
|
|
||
| struct cpu_ctx { | ||
| u64 cstats[MAX_CELLS][NR_CSTATS]; | ||
| u64 cell_cycles[MAX_CELLS]; | ||
|
|
@@ -51,14 +65,4 @@ struct cgrp_ctx { | |
| bool cell_owner; | ||
| }; | ||
|
|
||
| /* | ||
| * cell is the per-cell book-keeping | ||
| */ | ||
| struct cell { | ||
| // current vtime of the cell | ||
| u64 vtime_now; | ||
| // Whether or not the cell is used or not | ||
| u32 in_use; | ||
| }; | ||
|
|
||
| #endif /* __INTF_H */ | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,150 @@ | ||
| /* Copyright (c) Meta Platforms, Inc. and affiliates. */ | ||
| /* | ||
| * This software may be used and distributed according to the terms of the | ||
| * GNU General Public License version 2. | ||
| * | ||
| * This header adds L3 cache awareness to scx_mitosis by defining BPF | ||
| * maps for CPU-to-L3 domain mappings. It provides functions to | ||
| * recalculate per-L3 CPU counts within cells and implements weighted | ||
| * random L3 selection for tasks. It also tracks work-stealing | ||
| * statistics for cross-L3 task migrations. | ||
| */ | ||
| #pragma once | ||
|
|
||
| #include "mitosis.bpf.h" | ||
| #include "intf.h" | ||
|
|
||
| // It's also an option to just compute this from the cpu_to_l3 map. | ||
| struct l3_cpu_mask { | ||
| unsigned long cpumask[CPUMASK_LONG_ENTRIES]; | ||
tommy-u marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| }; | ||
|
|
||
| /* Work stealing statistics map - accessible from both BPF and userspace */ | ||
| struct steal_stats_map { | ||
| __uint(type, BPF_MAP_TYPE_ARRAY); | ||
| __type(key, u32); | ||
| __type(value, u64); | ||
| __uint(max_entries, 1); | ||
| }; | ||
|
|
||
| // A CPU -> L3 cache ID map | ||
| struct cpu_to_l3_map { | ||
| __uint(type, BPF_MAP_TYPE_ARRAY); | ||
| __type(key, u32); | ||
| __type(value, u32); | ||
| __uint(max_entries, MAX_CPUS); | ||
| }; | ||
|
|
||
| struct l3_to_cpus_map { | ||
| __uint(type, BPF_MAP_TYPE_ARRAY); | ||
| __type(key, u32); | ||
| __type(value, struct l3_cpu_mask); | ||
| __uint(max_entries, MAX_L3S); | ||
| }; | ||
|
|
||
| extern struct cpu_to_l3_map cpu_to_l3 SEC(".maps"); | ||
| extern struct l3_to_cpus_map l3_to_cpus SEC(".maps"); | ||
| extern struct steal_stats_map steal_stats SEC(".maps"); | ||
|
|
||
| static inline const struct cpumask *lookup_l3_cpumask(u32 l3) | ||
| { | ||
| struct l3_cpu_mask *mask; | ||
|
|
||
| if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) { | ||
| scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus); | ||
| return NULL; | ||
| } | ||
|
|
||
| return (const struct cpumask *)mask; | ||
tommy-u marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| /* Recompute cell->l3_cpu_cnt[] after cell cpumask changes (no persistent kptrs). */ | ||
tommy-u marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| static __always_inline void recalc_cell_l3_counts(u32 cell_idx) | ||
| { | ||
| struct cell *cell = lookup_cell(cell_idx); | ||
| if (!cell) | ||
| return; | ||
|
|
||
| struct bpf_cpumask *tmp = bpf_cpumask_create(); | ||
| if (!tmp) | ||
| return; | ||
|
|
||
| u32 l3, present = 0, total_cpus = 0; | ||
|
|
||
| bpf_rcu_read_lock(); | ||
| const struct cpumask *cell_mask = | ||
| lookup_cell_cpumask(cell_idx); // RCU ptr | ||
| if (!cell_mask) { | ||
| bpf_rcu_read_unlock(); | ||
| bpf_cpumask_release(tmp); | ||
| return; | ||
| } | ||
|
|
||
| bpf_for(l3, 0, nr_l3) | ||
| { | ||
| const struct cpumask *l3_mask = | ||
| lookup_l3_cpumask(l3); // plain map memory | ||
| if (!l3_mask) { | ||
| cell->l3_cpu_cnt[l3] = 0; | ||
| continue; | ||
| } | ||
|
|
||
| /* ok: dst is bpf_cpumask*, sources are (RCU cpumask*, plain cpumask*) */ | ||
| bpf_cpumask_and(tmp, cell_mask, l3_mask); | ||
tommy-u marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp); | ||
| cell->l3_cpu_cnt[l3] = cnt; | ||
| total_cpus += cnt; | ||
| if (cnt) | ||
| present++; | ||
| } | ||
| bpf_rcu_read_unlock(); | ||
|
|
||
| cell->l3_present_cnt = present; | ||
| cell->cpu_cnt = total_cpus; | ||
tommy-u marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| bpf_cpumask_release(tmp); | ||
| } | ||
|
|
||
| /** | ||
| * Weighted random selection of an L3 cache domain for a task. | ||
| * | ||
| * Uses the CPU count in each L3 domain within the cell as weights to | ||
| * probabilistically select an L3. L3 domains with more CPUs in the cell | ||
| * have higher probability of being selected. | ||
| * | ||
| * @cell_id: The cell ID to select an L3 from | ||
| * @return: L3 ID on success, INVALID_L3_ID on error, or 0 as fallback | ||
tommy-u marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| */ | ||
| static inline s32 pick_l3_for_task(u32 cell_id) | ||
| { | ||
| struct cell *cell; | ||
| u32 l3, target, cur = 0; | ||
| s32 ret = INVALID_L3_ID; | ||
|
|
||
| /* Look up the cell structure */ | ||
| if (!(cell = lookup_cell(cell_id))) | ||
| return INVALID_L3_ID; | ||
|
|
||
| /* Handle case where cell has no CPUs assigned yet */ | ||
| if (!cell->cpu_cnt) { | ||
| scx_bpf_error( | ||
| "pick_l3_for_task: cell %d has no CPUs accounted yet", | ||
| cell_id); | ||
| return INVALID_L3_ID; | ||
| } | ||
|
|
||
| /* Generate random target value in range [0, cpu_cnt) */ | ||
| target = bpf_get_prandom_u32() % cell->cpu_cnt; | ||
|
|
||
| /* Find the L3 domain corresponding to the target value using | ||
| * weighted selection - accumulate CPU counts until we exceed target */ | ||
| bpf_for(l3, 0, nr_l3) | ||
| { | ||
| cur += cell->l3_cpu_cnt[l3]; | ||
| if (target < cur) { | ||
| ret = (s32)l3; | ||
| break; | ||
| } | ||
| } | ||
| return ret; | ||
| } | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.