From 2d285d561543d76b4a13263731b447e646c258f2 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 9 Sep 2024 13:42:47 -1000
Subject: [PATCH] scx_qmap: Implement highpri boosting

Implement a silly boosting mechanism for nice -20 tasks. Its only
purpose is demonstrating and testing scx_bpf_dispatch_from_dsq(). The
boosting only works within SHARED_DSQ and makes a minor difference only
when the dispatch batch (-b) is larger than one.

This exercises moving tasks to a user DSQ and to all the local DSQs
from both ops.dispatch() and a BPF timerfn.

v2: - Updated to use scx_bpf_dispatch_from_dsq_set_{slice|vtime}().

    - Dropped the workaround for the iterated tasks not being trusted by
      the verifier. The issue has been fixed on the BPF side.

Signed-off-by: Tejun Heo
Cc: Daniel Hodges
Cc: David Vernet
Cc: Changwoo Min
Cc: Andrea Righi
Cc: Dan Schatzberg
---
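
For reviewers unfamiliar with the new kfuncs, the pattern the patch
exercises boils down to the sketch below. SRC_DSQ, DST_DSQ and
want_boost() are illustrative placeholders rather than symbols from this
patch, and error handling is elided:

	struct task_struct *p;

	/*
	 * Walk SRC_DSQ in queue order. BPF_FOR_EACH_ITER refers to the
	 * enclosing iterator, which the *_from_dsq() kfuncs need so that
	 * the current task can be moved safely mid-iteration.
	 */
	bpf_for_each(scx_dsq, p, SRC_DSQ, 0) {
		if (!want_boost(p))	/* placeholder predicate */
			continue;

		/* override the slice and vtime the following move will use */
		scx_bpf_dispatch_from_dsq_set_slice(BPF_FOR_EACH_ITER,
						    SCX_SLICE_DFL);
		scx_bpf_dispatch_from_dsq_set_vtime(BPF_FOR_EACH_ITER, 0);

		/* move @p from SRC_DSQ to the vtime-ordered DST_DSQ */
		scx_bpf_dispatch_vtime_from_dsq(BPF_FOR_EACH_ITER, p,
						DST_DSQ, 0);
	}

The move kfuncs can fail when the task has already left the iterated
DSQ, e.g. because it got consumed concurrently; the patch surfaces such
races through the exp_lost counter.
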
 tools/sched_ext/scx_qmap.bpf.c | 133 +++++++++++++++++++++++++++++----
 tools/sched_ext/scx_qmap.c     |  11 ++-
 2 files changed, 130 insertions(+), 14 deletions(-)

diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 7e7f28f4117e..83c8f54c1e31 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -27,6 +27,8 @@ enum consts {
 	ONE_SEC_IN_NS		= 1000000000,
 	SHARED_DSQ		= 0,
+	HIGHPRI_DSQ		= 1,
+	HIGHPRI_WEIGHT		= 8668,		/* this is what -20 maps to */
 };
 
 char _license[] SEC("license") = "GPL";
@@ -36,10 +38,12 @@ const volatile u32 stall_user_nth;
 const volatile u32 stall_kernel_nth;
 const volatile u32 dsp_inf_loop_after;
 const volatile u32 dsp_batch;
+const volatile bool highpri_boosting;
 const volatile bool print_shared_dsq;
 const volatile s32 disallow_tgid;
 const volatile bool suppress_dump;
 
+u64 nr_highpri_queued;
 u32 test_error_cnt;
 
 UEI_DEFINE(uei);
@@ -95,6 +99,7 @@ static u64 core_sched_tail_seqs[5];
 /* Per-task scheduling context */
 struct task_ctx {
 	bool	force_local;	/* Dispatch directly to local_dsq */
+	bool	highpri;
 	u64	core_sched_seq;
 };
@@ -122,6 +127,7 @@ struct {
 /* Statistics */
 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
 u64 nr_core_sched_execed;
+u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
 u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
@@ -140,17 +146,25 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
 	return -1;
 }
 
+static struct task_ctx *lookup_task_ctx(struct task_struct *p)
+{
+	struct task_ctx *tctx;
+
+	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+		scx_bpf_error("task_ctx lookup failed");
+		return NULL;
+	}
+	return tctx;
+}
+
 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
 {
 	struct task_ctx *tctx;
 	s32 cpu;
 
-	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
-	if (!tctx) {
-		scx_bpf_error("task_ctx lookup failed");
+	if (!(tctx = lookup_task_ctx(p)))
 		return -ESRCH;
-	}
 
 	cpu = pick_direct_dispatch_cpu(p, prev_cpu);
@@ -197,11 +211,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	if (test_error_cnt && !--test_error_cnt)
 		scx_bpf_error("test triggering error");
 
-	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
-	if (!tctx) {
-		scx_bpf_error("task_ctx lookup failed");
+	if (!(tctx = lookup_task_ctx(p)))
 		return;
-	}
 
 	/*
 	 * All enqueued tasks must have their core_sched_seq updated for correct
@@ -255,6 +266,10 @@
 		return;
 	}
 
+	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
+		tctx->highpri = true;
+		__sync_fetch_and_add(&nr_highpri_queued, 1);
+	}
 	__sync_fetch_and_add(&nr_enqueued, 1);
 }
@@ -271,13 +286,80 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
 
 static void update_core_sched_head_seq(struct task_struct *p)
 {
-	struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
 	int idx = weight_to_idx(p->scx.weight);
+	struct task_ctx *tctx;
 
-	if (tctx)
+	if ((tctx = lookup_task_ctx(p)))
 		core_sched_head_seqs[idx] = tctx->core_sched_seq;
-	else
-		scx_bpf_error("task_ctx lookup failed");
+}
+
+/*
+ * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement a silly
+ * selective priority boosting mechanism by scanning SHARED_DSQ looking for
+ * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first.
+ * This makes a minor difference only when dsp_batch is larger than 1.
+ *
+ * scx_bpf_dispatch[_vtime]_from_dsq() are allowed from both ops.dispatch()
+ * and non-rq-lock-holding BPF programs. As a demonstration, this function
+ * is called from both qmap_dispatch() and monitor_timerfn().
+ */
+static bool dispatch_highpri(bool from_timer)
+{
+	struct task_struct *p;
+	s32 this_cpu = bpf_get_smp_processor_id();
+
+	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
+	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
+		static u64 highpri_seq;
+		struct task_ctx *tctx;
+
+		if (!(tctx = lookup_task_ctx(p)))
+			return false;
+
+		if (tctx->highpri) {
+			/* exercise the set_*() and vtime interface too */
+			scx_bpf_dispatch_from_dsq_set_slice(
+				BPF_FOR_EACH_ITER, slice_ns * 2);
+			scx_bpf_dispatch_from_dsq_set_vtime(
+				BPF_FOR_EACH_ITER, highpri_seq++);
+			scx_bpf_dispatch_vtime_from_dsq(
+				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
+		}
+	}
+
+	/*
+	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this
+	 * CPU is found.
+	 */
+	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
+		bool dispatched = false;
+		s32 cpu;
+
+		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
+			cpu = this_cpu;
+		else
+			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
+
+		if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
+					      SCX_DSQ_LOCAL_ON | cpu,
+					      SCX_ENQ_PREEMPT)) {
+			if (cpu == this_cpu) {
+				dispatched = true;
+				__sync_fetch_and_add(&nr_expedited_local, 1);
+			} else {
+				__sync_fetch_and_add(&nr_expedited_remote, 1);
+			}
+			if (from_timer)
+				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
+		} else {
+			__sync_fetch_and_add(&nr_expedited_lost, 1);
+		}
+
+		if (dispatched)
+			return true;
+	}
+
+	return false;
 }
 
 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
@@ -289,7 +371,10 @@
 	void *fifo;
 	s32 i, pid;
 
-	if (scx_bpf_consume(SHARED_DSQ))
+	if (dispatch_highpri(false))
+		return;
+
+	if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ))
 		return;
 
 	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
@@ -326,6 +411,8 @@
 
 		/* Dispatch or advance. */
 		bpf_repeat(BPF_MAX_LOOPS) {
+			struct task_ctx *tctx;
+
 			if (bpf_map_pop_elem(fifo, &pid))
 				break;
@@ -333,13 +420,25 @@
 			if (!p)
 				continue;
 
+			if (!(tctx = lookup_task_ctx(p))) {
+				bpf_task_release(p);
+				return;
+			}
+
+			if (tctx->highpri)
+				__sync_fetch_and_sub(&nr_highpri_queued, 1);
+
 			update_core_sched_head_seq(p);
 			__sync_fetch_and_add(&nr_dispatched, 1);
+
 			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
 			bpf_task_release(p);
+
 			batch--;
 			cpuc->dsp_cnt--;
 			if (!batch || !scx_bpf_dispatch_nr_slots()) {
+				if (dispatch_highpri(false))
+					return;
 				scx_bpf_consume(SHARED_DSQ);
 				return;
 			}
@@ -664,6 +763,10 @@ static void dump_shared_dsq(void)
 
 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
 {
+	bpf_rcu_read_lock();
+	dispatch_highpri(true);
+	bpf_rcu_read_unlock();
+
 	monitor_cpuperf();
 
 	if (print_shared_dsq)
@@ -685,6 +788,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 	if (ret)
 		return ret;
 
+	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
+	if (ret)
+		return ret;
+
 	timer = bpf_map_lookup_elem(&monitor_timer, &key);
 	if (!timer)
 		return -ESRCH;
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index c9ca30d62b2b..ac45a02b4055 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -29,6 +29,7 @@ const char help_fmt[] =
 "  -l COUNT      Trigger dispatch infinite looping after COUNT dispatches\n"
 "  -b COUNT      Dispatch upto COUNT tasks together\n"
 "  -P            Print out DSQ content to trace_pipe every second, use with -b\n"
+"  -H            Boost nice -20 tasks in SHARED_DSQ, use with -b\n"
 "  -d PID        Disallow a process from switching into SCHED_EXT (-1 for self)\n"
 "  -D LEN        Set scx_exit_info.dump buffer length\n"
 "  -S            Suppress qmap-specific debug dump\n"
@@ -63,7 +64,7 @@ int main(int argc, char **argv)
 
 	skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
 
-	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) {
+	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -86,6 +87,9 @@
 		case 'P':
 			skel->rodata->print_shared_dsq = true;
 			break;
+		case 'H':
+			skel->rodata->highpri_boosting = true;
+			break;
 		case 'd':
 			skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
 			if (skel->rodata->disallow_tgid < 0)
@@ -121,6 +125,11 @@
 		       skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
 		       skel->bss->nr_core_sched_execed,
 		       skel->bss->nr_ddsp_from_enq);
+		printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
+		       skel->bss->nr_expedited_local,
+		       skel->bss->nr_expedited_remote,
+		       skel->bss->nr_expedited_from_timer,
+		       skel->bss->nr_expedited_lost);
 		if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
 			printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
 			       skel->bss->cpuperf_min,
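
A quick way to see the machinery in action (example invocation, not part
of the patch): run "scx_qmap -H -b 8" and generate load that includes
nice -20 tasks, e.g. started under "nice -n -20". In the per-second
stats line, exp_local counts highpri tasks expedited to the local DSQ of
the dispatching CPU, exp_remote those punted to another CPU's local DSQ,
exp_timer the subset of moves initiated from monitor_timerfn(), and
exp_lost the attempts that failed because the task had already left the
DSQ by the time scx_bpf_dispatch_from_dsq() ran.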