diff --git a/scheds/rust/scx_layered/src/bpf/intf.h b/scheds/rust/scx_layered/src/bpf/intf.h
index bfbde8e4b..5b2823569 100644
--- a/scheds/rust/scx_layered/src/bpf/intf.h
+++ b/scheds/rust/scx_layered/src/bpf/intf.h
@@ -202,6 +202,9 @@ struct cpu_ctx {
 	u32			gn_layer_order[MAX_LAYERS]; /* grouped non-preempt */
 
 	struct cpu_prox_map	prox_map;
+
+	u64			sticky_mod_end_time_ns;
+	u64			sticky_mod_pred_pct;
 };
 
 struct llc_prox_map {
@@ -332,6 +335,9 @@ struct layer {
 	char			name[MAX_LAYER_NAME];
 	bool			is_protected;
+
+	u64			sticky_mod_min_ns;
+	u64			sticky_mod_pred_pct;
 };
 
 struct scx_cmd {
diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index 6d608d695..4fa41e444 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -52,6 +52,7 @@ const volatile u64 min_open_layer_disallow_preempt_after_ns;
 const volatile u64 lo_fb_wait_ns = 5000000;	/* !0 for veristat */
 const volatile u32 lo_fb_share_ppk = 128;	/* !0 for veristat */
 const volatile bool percpu_kthread_preempt = true;
+int active_sticky_mod = 0;
 
 /* Flag to enable or disable antistall feature */
 const volatile bool enable_antistall = true;
@@ -499,6 +500,11 @@ struct task_ctx {
 	u32			qrt_llc_id;
 
 	char			join_layer[SCXCMD_COMLEN];
+
+#define STICKY_MOD_NR_BUCKETS 8
+	u64			sticky_mod_buckets[STICKY_MOD_NR_BUCKETS];
+	u64			sticky_mod_nr_cnt;
+	u64			sticky_mod_start_ns;
 };
 
 struct {
@@ -871,6 +877,47 @@ s32 pick_idle_big_little(struct layer *layer, struct task_ctx *taskc,
 	return cpu;
 }
 
+static __always_inline
+s32 pick_sticky_mod_cpu(struct llc_ctx *llc, struct layer *layer, s32 prev_cpu)
+{
+	u64 time = bpf_ktime_get_ns();
+	const struct cpumask *cpumask;
+	struct cpu_ctx *cpu_ctx;
+	s32 cpu = -1;
+	int i;
+
+	if (!active_sticky_mod)
+		return cpu;
+
+	cpu_ctx = lookup_cpu_ctx(prev_cpu);
+	if (!cpu_ctx)
+		goto llc;
+	if (cpu_ctx->sticky_mod_pred_pct < layer->sticky_mod_pred_pct)
+		goto llc;
+	if (cpu_ctx->sticky_mod_end_time_ns - time > layer->sticky_mod_min_ns)
+		goto llc;
+	return prev_cpu;
+llc:
+	if (!(cpumask = cast_mask(llc->cpumask)))
+		goto out;
+	bpf_for(i, 0, nr_possible_cpus) {
+		if (i == prev_cpu)
+			continue;
+		if (!bpf_cpumask_test_cpu(i, cpumask))
+			continue;
+		if (!(cpu_ctx = lookup_cpu_ctx(i)))
+			continue;
+		if (cpu_ctx->sticky_mod_pred_pct < layer->sticky_mod_pred_pct)
+			continue;
+		if (cpu_ctx->sticky_mod_end_time_ns - time > layer->sticky_mod_min_ns)
+			continue;
+		cpu = i;
+		break;
+	}
+out:
+	return cpu;
+}
+
 static __always_inline
 s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 		  struct cpu_ctx *cpuc, struct task_ctx *taskc, struct layer *layer,
@@ -987,6 +1034,9 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 			cpu = -1;
 			goto out_put;
 		}
+
+		if ((cpu = pick_sticky_mod_cpu(prev_llcc, layer, prev_cpu)) >= 0)
+			goto out_put;
 	}
 
 	/*
@@ -1195,6 +1245,55 @@ static void layer_kick_idle_cpu(struct layer *layer)
 	scx_bpf_put_idle_cpumask(idle_smtmask);
 }
 
+SEC("tp_btf/sched_switch")
+int BPF_PROG(layered_sched_switch, bool ignore, struct task_struct *prev, struct task_struct *next)
+{
+	u64 time = bpf_ktime_get_ns();
+	u64 duration = time, max = 0;
+	u32 beg = 0, end = 50000, i;
+	struct task_ctx *pc, *nc;
+	struct cpu_ctx *c;
+	u32 max_i = 0;
+
+	if (!active_sticky_mod)
+		return 0;
+
+	if (!(pc = lookup_task_ctx_may_fail(prev)))
+		goto next;
+
+	duration -= pc->sticky_mod_start_ns;
+	duration /= 1000;
+
+	pc->sticky_mod_nr_cnt++;
+
+	for (i = 0; i < STICKY_MOD_NR_BUCKETS; i++) {
+		u64 cnt = pc->sticky_mod_buckets[i];
+
+		if (duration >= beg && duration <= end) {
+			pc->sticky_mod_buckets[i]++;
+			cnt++;
+		}
+		if (max < cnt) {
+			max = cnt;
+			max_i = i;
+		}
+		beg += 50000;
+		end += 50000;
+		if (i == STICKY_MOD_NR_BUCKETS - 2)
+			end = -1;
+	}
+
+	if (!(c = lookup_cpu_ctx(-1)))
+		goto next;
+	c->sticky_mod_end_time_ns = (max_i + 1) * 50000;
+	c->sticky_mod_pred_pct = ((max * 100) / pc->sticky_mod_nr_cnt);
+next:
+	if (!(nc = lookup_task_ctx_may_fail(next)))
+		return 0;
+	nc->sticky_mod_start_ns = time;
+	return 0;
+}
+
 void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	struct cpu_ctx *cpuc, *task_cpuc;
@@ -1718,6 +1817,9 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
 				xllc_mig_skipped = true;
 				continue;
 			}
+
+			if (pick_sticky_mod_cpu(remote_llcc, layer, -1) >= 0)
+				continue;
 		}
 
 		if (scx_bpf_dsq_move_to_local(layer_dsq_id(layer_id, *llc_idp)))
@@ -3174,6 +3276,9 @@ static s32 init_layer(int layer_id)
 		return ret;
 	}
 
+	if (layer->sticky_mod_min_ns || layer->sticky_mod_pred_pct)
+		active_sticky_mod++;
+
 	return 0;
 }
 
diff --git a/scheds/rust/scx_layered/src/config.rs b/scheds/rust/scx_layered/src/config.rs
index ea78d8867..eb8a5ef0d 100644
--- a/scheds/rust/scx_layered/src/config.rs
+++ b/scheds/rust/scx_layered/src/config.rs
@@ -122,6 +122,10 @@ pub struct LayerCommon {
     pub nodes: Vec<usize>,
     #[serde(default)]
     pub llcs: Vec<usize>,
+    #[serde(default)]
+    pub sticky_mod_min_us: f64,
+    #[serde(default)]
+    pub sticky_mod_pred_pct: f64,
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize)]
diff --git a/scheds/rust/scx_layered/src/main.rs b/scheds/rust/scx_layered/src/main.rs
index 3b3ab9745..7052b1d0d 100644
--- a/scheds/rust/scx_layered/src/main.rs
+++ b/scheds/rust/scx_layered/src/main.rs
@@ -123,6 +123,8 @@ lazy_static! {
                         perf: 1024,
                         nodes: vec![],
                         llcs: vec![],
+                        sticky_mod_min_us: 0.0,
+                        sticky_mod_pred_pct: 0.0,
                     },
                 },
             },
@@ -154,6 +156,8 @@ lazy_static! {
                         idle_resume_us: None,
                         nodes: vec![],
                         llcs: vec![],
+                        sticky_mod_min_us: 0.0,
+                        sticky_mod_pred_pct: 0.0,
                     },
                 },
             },
@@ -189,6 +193,8 @@ lazy_static! {
                         idle_resume_us: None,
                         nodes: vec![],
                         llcs: vec![],
+                        sticky_mod_min_us: 0.0,
+                        sticky_mod_pred_pct: 0.0,
                     },
                 },
             },
@@ -221,6 +227,8 @@ lazy_static! {
                         idle_resume_us: None,
                         nodes: vec![],
                         llcs: vec![],
+                        sticky_mod_min_us: 0.0,
+                        sticky_mod_pred_pct: 0.0,
                     },
                 },
             },
@@ -428,6 +436,17 @@ lazy_static! {
 /// the nodes value is set the cpuset of LLCs will be or'ed with the nodes
 /// config.
 ///
+/// - sticky_mod_min_us: Skip cross-CPU migration if the previous CPU (or one
+///   of the CPUs in the previous LLC) is predicted to become available again
+///   sooner than this threshold.
+///
+/// - sticky_mod_pred_pct: Percentage threshold that decides whether a task
+///   sticks to its previous CPU (or one of the CPUs in its previous LLC)
+///   instead of going through normal CPU selection. It is compared against
+///   the share of the task's runtimes that fall into its most common runtime
+///   bucket, i.e. how predictable the task's runtime is. E.g. 90 means only
+///   tasks predictable with at least 90% accuracy get stickiness modulation.
+///
 ///
 /// Similar to matches, adding new policies and extending existing ones
 /// should be relatively straightforward.
@@ -1318,6 +1337,8 @@ impl<'a> Scheduler<'a> {
             disallow_open_after_us,
             disallow_preempt_after_us,
             xllc_mig_min_us,
+            sticky_mod_min_us,
+            sticky_mod_pred_pct,
             ..
         } = spec.kind.common();
 
@@ -1359,6 +1380,8 @@ impl<'a> Scheduler<'a> {
                 }
                 layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
             }
+            layer.sticky_mod_min_ns = (sticky_mod_min_us * 1000.0) as u64;
+            layer.sticky_mod_pred_pct = sticky_mod_pred_pct.clamp(0.0, 100.0) as u64;
         }
 
         layer.is_protected.write(match spec.kind {
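
Note (illustration, not part of the patch): the tp_btf/sched_switch program above builds, per task, a histogram of run durations in eight 50ms-wide buckets (the last bucket is open-ended) and publishes two values on the CPU the task just ran on: the upper bound of the most popular bucket (sticky_mod_end_time_ns) and the share of samples that landed in that bucket (sticky_mod_pred_pct). pick_sticky_mod_cpu() then keeps a waking task on its previous CPU, or another CPU in the previous LLC, only when that CPU's published confidence is at least the layer's sticky_mod_pred_pct and its predicted availability falls within the layer's sticky_mod_min_ns. The user-space sketch below mirrors that bucketing arithmetic (boundary handling simplified); the pred/pred_add/pred_query names are illustrative only, not part of the scheduler.

/*
 * Standalone sketch of the runtime-prediction scheme used by the patch:
 * bucket runtimes into 8 x 50ms buckets, report the most popular bucket's
 * upper bound and the percentage of samples that fell into it.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define NR_BUCKETS	8
#define BUCKET_US	50000	/* 50ms per bucket, last bucket open-ended */

struct pred {
	uint64_t buckets[NR_BUCKETS];
	uint64_t nr_samples;
};

/* Record one runtime sample (in microseconds). */
static void pred_add(struct pred *p, uint64_t runtime_us)
{
	uint64_t i = runtime_us / BUCKET_US;

	if (i >= NR_BUCKETS)
		i = NR_BUCKETS - 1;
	p->buckets[i]++;
	p->nr_samples++;
}

/*
 * Return the upper bound (us) of the most popular bucket and store the
 * percentage of samples in it -- the analogues of what the patch records
 * per CPU as sticky_mod_end_time_ns and sticky_mod_pred_pct.
 */
static uint64_t pred_query(const struct pred *p, uint64_t *pct)
{
	uint64_t max = 0, max_i = 0, i;

	for (i = 0; i < NR_BUCKETS; i++) {
		if (p->buckets[i] > max) {
			max = p->buckets[i];
			max_i = i;
		}
	}
	*pct = p->nr_samples ? max * 100 / p->nr_samples : 0;
	return (max_i + 1) * BUCKET_US;
}

int main(void)
{
	/* e.g. a task that mostly runs ~30ms, with one ~120ms outlier */
	uint64_t samples[] = { 28000, 31000, 29500, 122000, 30500, 27500 };
	struct pred p = { 0 };
	uint64_t pct, bound;
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		pred_add(&p, samples[i]);

	bound = pred_query(&p, &pct);
	printf("predicted runtime <= %lu us with %lu%% confidence\n",
	       (unsigned long)bound, (unsigned long)pct);
	return 0;
}

With these sample runtimes, 5 of 6 samples land in the first bucket, so the prediction is "runtime <= 50000us" with 83% confidence; a layer configured with, say, sticky_mod_pred_pct: 80.0 and a sufficiently large sticky_mod_min_us would treat such a task as predictable and keep it on its previous CPU (those config values are only an example).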