scx_layered: Implement sticky modulation optimization #1690

Draft · wants to merge 1 commit into base: main
6 changes: 6 additions & 0 deletions scheds/rust/scx_layered/src/bpf/intf.h
@@ -202,6 +202,9 @@ struct cpu_ctx {
        u32 gn_layer_order[MAX_LAYERS]; /* grouped non-preempt */

        struct cpu_prox_map prox_map;

        u64 sticky_mod_end_time_ns;
        u64 sticky_mod_pred_pct;
};

struct llc_prox_map {
@@ -332,6 +335,9 @@ struct layer {

        char name[MAX_LAYER_NAME];
        bool is_protected;

        u64 sticky_mod_min_ns;
        u64 sticky_mod_pred_pct;
};

struct scx_cmd {
105 changes: 105 additions & 0 deletions scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -52,6 +52,7 @@ const volatile u64 min_open_layer_disallow_preempt_after_ns;
const volatile u64 lo_fb_wait_ns = 5000000; /* !0 for veristat */
const volatile u32 lo_fb_share_ppk = 128; /* !0 for veristat */
const volatile bool percpu_kthread_preempt = true;
int active_sticky_mod = 0;

/* Flag to enable or disable antistall feature */
const volatile bool enable_antistall = true;
@@ -499,6 +500,11 @@ struct task_ctx {
        u32 qrt_llc_id;

        char join_layer[SCXCMD_COMLEN];

#define STICKY_MOD_NR_BUCKETS 8
        u64 sticky_mod_buckets[STICKY_MOD_NR_BUCKETS];
        u64 sticky_mod_nr_cnt;
        u64 sticky_mod_start_ns;
};

struct {
@@ -871,6 +877,47 @@ s32 pick_idle_big_little(struct layer *layer, struct task_ctx *taskc,
        return cpu;
}

static __always_inline
s32 pick_sticky_mod_cpu(struct llc_ctx *llc, struct layer *layer, s32 prev_cpu)
{
        u64 time = bpf_ktime_get_ns();
        const struct cpumask *cpumask;
        struct cpu_ctx *cpu_ctx;
        s32 cpu = -1;
        int i;

        if (!active_sticky_mod)
                return cpu;

        cpu_ctx = lookup_cpu_ctx(prev_cpu);
Contributor: Nit: Factoring out into a small inline function can avoid all the gotos.
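For illustration, a rough sketch of that suggestion; the helper name cpu_sticky_mod_ok is made up here and is not part of the patch:

/* Hypothetical helper: true if @cpu is expected to free up soon enough and
 * predictably enough for @layer, mirroring the checks in the patch. */
static __always_inline bool cpu_sticky_mod_ok(s32 cpu, struct layer *layer, u64 now)
{
        struct cpu_ctx *cpu_ctx = lookup_cpu_ctx(cpu);

        if (!cpu_ctx)
                return false;
        if (cpu_ctx->sticky_mod_pred_pct < layer->sticky_mod_pred_pct)
                return false;
        if (cpu_ctx->sticky_mod_end_time_ns - now > layer->sticky_mod_min_ns)
                return false;
        return true;
}

pick_sticky_mod_cpu() could then return prev_cpu when cpu_sticky_mod_ok(prev_cpu, layer, time) holds and reuse the same test in the LLC loop, dropping the llc:/out: labels.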

        if (!cpu_ctx)
                goto llc;
        if (cpu_ctx->sticky_mod_pred_pct < layer->sticky_mod_pred_pct)
                goto llc;
        if (cpu_ctx->sticky_mod_end_time_ns - time > layer->sticky_mod_min_ns)
                goto llc;
        return prev_cpu;
llc:
        if (!(cpumask = cast_mask(llc->cpumask)))
                goto out;
        bpf_for(i, 0, nr_possible_cpus) {
Contributor: (Comment: I'm surprised we don't have an FFS based foreach cpu in cpumask primitive)

Contributor Author: Yeah, if there's a better way to do this, I'm all ears. It's suboptimal, especially on machines with a lot of CPUs. Even on Bergamo, we're iterating over 88 CPUs for every 8 we want to test.

Contributor Author: One thing that comes to mind is storing start and end CPUs in llc_ctx, but that requires the assignment to be sequential.

Contributor: layered stores iteration indices in per-cpu/llc arrays. It's a bit more setup work but overall not that bad. But yeah, bit-wise iterators would be great.
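As a rough sketch of the sequential-range idea above, assuming hypothetical cpu_beg/cpu_end fields were added to llc_ctx (not part of this patch, and only valid if each LLC owns a contiguous CPU id range), the loop could shrink to:

        /* Hypothetical: llc->cpu_beg/cpu_end bound the LLC's contiguous CPU ids. */
        bpf_for(i, llc->cpu_beg, llc->cpu_end + 1) {
                if (i == prev_cpu)
                        continue;
                /* same per-CPU sticky_mod checks as in the loop body just below */
        }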

                if (i == prev_cpu)
                        continue;
                if (!bpf_cpumask_test_cpu(i, cpumask))
                        continue;
                if (!(cpu_ctx = lookup_cpu_ctx(i)))
                        continue;
                if (cpu_ctx->sticky_mod_pred_pct < layer->sticky_mod_pred_pct)
                        continue;
                if (cpu_ctx->sticky_mod_end_time_ns - time > layer->sticky_mod_min_ns)
                        continue;
                cpu = i;
                break;
        }
out:
        return cpu;
}

static __always_inline
s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
struct cpu_ctx *cpuc, struct task_ctx *taskc, struct layer *layer,
@@ -987,6 +1034,9 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
                        cpu = -1;
                        goto out_put;
                }

                if ((cpu = pick_sticky_mod_cpu(prev_llcc, layer, prev_cpu)) >= 0)
                        goto out_put;
        }

/*
@@ -1195,6 +1245,55 @@ static void layer_kick_idle_cpu(struct layer *layer)
        scx_bpf_put_idle_cpumask(idle_smtmask);
}

SEC("tp_btf/sched_switch")
Contributor: What is the overhead of the extra probe? Both in terms of having an extra probe and in terms of the code itself. Can we roll this into our starting/stopping methods?

Contributor Author: The probe itself should be pretty fast, but I think longer term hooking into starting/stopping methods is the better way, so I will do that.
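A rough sketch of how that could look, assuming the bookkeeping moves into the existing layered_running/layered_stopping callbacks; update_sticky_mod_hist() is a hypothetical helper wrapping the bucket logic from the probe below and is not part of this patch:

void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
{
        struct task_ctx *taskc;

        /* ... existing layered_running body ... */
        if (active_sticky_mod && (taskc = lookup_task_ctx_may_fail(p)))
                taskc->sticky_mod_start_ns = bpf_ktime_get_ns();
}

void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
{
        struct task_ctx *taskc;

        /* ... existing layered_stopping body ... */
        if (active_sticky_mod && (taskc = lookup_task_ctx_may_fail(p)))
                update_sticky_mod_hist(taskc); /* hypothetical: bucket update + per-CPU prediction */
}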

int BPF_PROG(layered_sched_switch, bool ignore, struct task_struct *prev, struct task_struct *next)
{
        u64 time = bpf_ktime_get_ns();
        u64 duration = time, max = 0;
        u32 beg = 0, end = 50000, i;
        struct task_ctx *pc, *nc;
        struct cpu_ctx *c;
        u32 max_i = 0;

        if (!active_sticky_mod)
                return 0;

        if (!(pc = lookup_task_ctx_may_fail(prev)))
                goto next;

        duration -= pc->sticky_mod_start_ns;
        duration /= 1000;

        pc->sticky_mod_nr_cnt++;

        for (i = 0; i < STICKY_MOD_NR_BUCKETS; i++) {
                u64 cnt = pc->sticky_mod_buckets[i];

                if (duration >= beg && duration <= end) {
                        pc->sticky_mod_buckets[i]++;
                        cnt++;
                }
                if (max < cnt) {
                        max = cnt;
                        max_i = i;
                }
                beg += 50000;
                end += 50000;
                if (i == STICKY_MOD_NR_BUCKETS - 2)
                        end = -1;
        }

        if (!(c = lookup_cpu_ctx(-1)))
                goto next;
        c->sticky_mod_end_time_ns = (max_i + 1) * 50000;
        c->sticky_mod_pred_pct = ((max * 100) / pc->sticky_mod_nr_cnt);
next:
        if (!(nc = lookup_task_ctx_may_fail(next)))
                return 0;
        nc->sticky_mod_start_ns = time;
        return 0;
}
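For illustration only, a worked example of what the probe above ends up publishing; the numbers are hypothetical, not from the patch:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the bucket math in layered_sched_switch: say a task was switched out
 * 100 times and 92 of those runs fell into the first 0-50000us bucket. */
int main(void)
{
        uint64_t nr_cnt = 100, max = 92, max_i = 0;

        /* upper edge of the most-hit bucket -> sticky_mod_end_time_ns */
        uint64_t end_time = (max_i + 1) * 50000;        /* 50000 */
        /* share of samples in that bucket -> sticky_mod_pred_pct */
        uint64_t pred_pct = (max * 100) / nr_cnt;       /* 92 */

        printf("end_time=%llu pred_pct=%llu\n",
               (unsigned long long)end_time, (unsigned long long)pred_pct);
        return 0;
}

A layer configured with sticky_mod_pred_pct at or below 92 would then treat that CPU as a sticky candidate, subject to the sticky_mod_min_ns check in pick_sticky_mod_cpu().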

void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
{
        struct cpu_ctx *cpuc, *task_cpuc;
@@ -1718,6 +1817,9 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
                                xllc_mig_skipped = true;
                                continue;
                        }

                        if (pick_sticky_mod_cpu(remote_llcc, layer, -1) >= 0)
                                continue;
                }

                if (scx_bpf_dsq_move_to_local(layer_dsq_id(layer_id, *llc_idp)))
@@ -3174,6 +3276,9 @@ static s32 init_layer(int layer_id)
                return ret;
        }

        if (layer->sticky_mod_min_ns || layer->sticky_mod_pred_pct)
                active_sticky_mod++;

        return 0;
}

4 changes: 4 additions & 0 deletions scheds/rust/scx_layered/src/config.rs
@@ -122,6 +122,10 @@ pub struct LayerCommon {
    pub nodes: Vec<usize>,
    #[serde(default)]
    pub llcs: Vec<usize>,
    #[serde(default)]
    pub sticky_mod_min_us: f64,
    #[serde(default)]
    pub sticky_mod_pred_pct: f64,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
23 changes: 23 additions & 0 deletions scheds/rust/scx_layered/src/main.rs
@@ -123,6 +123,8 @@ lazy_static! {
perf: 1024,
nodes: vec![],
llcs: vec![],
sticky_mod_min_us: 0.0,
sticky_mod_pred_pct: 0.0,
},
},
},
@@ -154,6 +156,8 @@ lazy_static! {
idle_resume_us: None,
nodes: vec![],
llcs: vec![],
sticky_mod_min_us: 0.0,
sticky_mod_pred_pct: 0.0,
},
},
},
@@ -189,6 +193,8 @@ lazy_static! {
idle_resume_us: None,
nodes: vec![],
llcs: vec![],
sticky_mod_min_us: 0.0,
sticky_mod_pred_pct: 0.0,
},
},
},
@@ -221,6 +227,8 @@ lazy_static! {
idle_resume_us: None,
nodes: vec![],
llcs: vec![],
sticky_mod_min_us: 0.0,
sticky_mod_pred_pct: 0.0,
},
},
},
@@ -428,6 +436,17 @@ lazy_static! {
/// the nodes value is set the cpuset of LLCs will be or'ed with the nodes
/// config.
///
/// - sticky_mod_min_us: Skip cross-CPU migration if the previous CPU (or one
/// of the CPUs in the previous LLC) is likely to become available for
/// execution sooner than this threshold.
///
/// - sticky_mod_pred_pct: The percentage threshold used to decide whether to
/// stick to the previous CPU, or to one of the CPUs in the previous LLC that
/// is opening up. It is compared against the percentage of times the process
/// stayed in its most common bucket of execution time, i.e. how confidently
/// its runtime can be predicted. E.g. 90 means that only processes whose
/// runtime is predictable with 90% accuracy or more are chosen for stickiness
/// modulation.
///
///
/// Similar to matches, adding new policies and extending existing ones
/// should be relatively straightforward.
@@ -1318,6 +1337,8 @@ impl<'a> Scheduler<'a> {
disallow_open_after_us,
disallow_preempt_after_us,
xllc_mig_min_us,
sticky_mod_min_us,
sticky_mod_pred_pct,
..
} = spec.kind.common();

@@ -1359,6 +1380,8 @@ impl<'a> Scheduler<'a> {
}
layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
}
layer.sticky_mod_min_ns = (sticky_mod_min_us * 1000.0) as u64;
layer.sticky_mod_pred_pct = sticky_mod_pred_pct.clamp(0.0, 100.0) as u64;
}

layer.is_protected.write(match spec.kind {