mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-17 02:36:21 +00:00
5209c03c8e
In sched_ext API, a repeatedly reported pain point is the overuse of the verb "dispatch" and confusion around "consume": - ops.dispatch() - scx_bpf_dispatch[_vtime]() - scx_bpf_consume() - scx_bpf_dispatch[_vtime]_from_dsq*() This overloading of the term is historical. Originally, there were only built-in DSQs and moving a task into a DSQ always dispatched it for execution. Using the verb "dispatch" for the kfuncs to move tasks into these DSQs made sense. Later, user DSQs were added and scx_bpf_dispatch[_vtime]() updated to be able to insert tasks into any DSQ. The only allowed DSQ to DSQ transfer was from a non-local DSQ to a local DSQ and this operation was named "consume". This was already confusing as a task could be dispatched to a user DSQ from ops.enqueue() and then the DSQ would have to be consumed in ops.dispatch(). Later addition of scx_bpf_dispatch_from_dsq*() made the confusion even worse as "dispatch" in this context meant moving a task to an arbitrary DSQ from a user DSQ. Clean up the API with the following renames: 1. scx_bpf_dispatch[_vtime]() -> scx_bpf_dsq_insert[_vtime]() 2. scx_bpf_consume() -> scx_bpf_dsq_move_to_local() 3. scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*() This patch performs the second rename. Compatibility is maintained by: - The previous kfunc names are still provided by the kernel so that old binaries can run. Kernel generates a warning when the old names are used. - compat.bpf.h provides wrappers for the new names which automatically fall back to the old names when running on older kernels. They also trigger build error if old names are used for new builds. The compat features will be dropped after v6.15. v2: Comment and documentation updates. 
Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Andrea Righi <arighi@nvidia.com> Acked-by: Changwoo Min <changwoo@igalia.com> Acked-by: Johannes Bechberger <me@mostlynerdless.de> Acked-by: Giovanni Gherdovich <ggherdovich@suse.com> Cc: Dan Schatzberg <dschatzberg@meta.com> Cc: Ming Yang <yougmark94@gmail.com>
157 lines
4.5 KiB
C
157 lines
4.5 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple scheduler.
 *
 * By default, it operates as a simple global weighted vtime scheduler and can
 * be switched to FIFO scheduling. It also demonstrates the following niceties.
 *
 * - Statistics tracking how many tasks are queued to local and global dsq's.
 * - Termination notification for userspace.
 *
 * While very simple, this scheduler should work reasonably well on CPUs with a
 * uniform L3 cache topology. While preemption is not implemented, the fact that
 * the scheduling queue is shared across all CPUs means that whatever is at the
 * front of the queue is likely to be executed fairly quickly given enough
 * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads
 * but comes with the usual problems with FIFO scheduling where saturating
 * threads can easily drown out interactive ones.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
|
|
#include <scx/common.bpf.h>
|
|
|
|
char _license[] SEC("license") = "GPL";
|
|
|
|
const volatile bool fifo_sched;
|
|
|
|
static u64 vtime_now;
|
|
UEI_DEFINE(uei);
|
|
|
|
/*
 * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues
 * (meaning, tasks cannot be inserted into them with
 * scx_bpf_dsq_insert_vtime()). We therefore create a separate DSQ with ID 0
 * that we insert tasks into and move tasks to the local DSQ from. If
 * scx_simple only supported global FIFO scheduling, then we could just use
 * SCX_DSQ_GLOBAL.
 */
#define SHARED_DSQ 0
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
|
__uint(key_size, sizeof(u32));
|
|
__uint(value_size, sizeof(u64));
|
|
__uint(max_entries, 2); /* [local, global] */
|
|
} stats SEC(".maps");
|
|
|
|
/*
 * Increment the counter stored at index @idx of the stats map. If the lookup
 * returns NULL, nothing is counted.
 */
static void stat_inc(u32 idx)
{
	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);

	if (cnt_p)
		(*cnt_p)++;
}
|
|
|
|
/*
 * Return true if vtime @a is before vtime @b.
 *
 * The comparison is made on the signed difference rather than on the raw
 * values, so it keeps giving the expected answer when the u64 counters wrap
 * around, provided @a and @b are less than 2^63 apart.
 */
static inline bool vtime_before(u64 a, u64 b)
{
	return (s64)(a - b) < 0;
}
|
|
|
|
/*
 * select_cpu operation: pick a CPU for @p.
 *
 * The choice is delegated to scx_bpf_select_cpu_dfl(). When that helper
 * reports through @is_idle that it found an idle CPU, @p is inserted directly
 * into SCX_DSQ_LOCAL with the default slice and the event is counted in
 * stats[0].
 *
 * Returns the CPU chosen by scx_bpf_select_cpu_dfl().
 */
s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
		stat_inc(0);	/* count local queueing */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	}

	return cpu;
}
|
|
|
|
/*
 * enqueue operation: queue @p on SHARED_DSQ.
 *
 * Every call is counted in stats[1]. In FIFO mode @p is inserted with
 * scx_bpf_dsq_insert(); otherwise it is inserted with
 * scx_bpf_dsq_insert_vtime() using its dsq_vtime, clamped so that it is no
 * more than one default slice behind vtime_now. @enq_flags is passed through
 * unchanged in both cases.
 */
void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
{
	stat_inc(1);	/* count global queueing */

	if (fifo_sched) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
	} else {
		u64 vtime = p->scx.dsq_vtime;

		/*
		 * Limit the amount of budget that an idling task can accumulate
		 * to one slice.
		 */
		if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
			vtime = vtime_now - SCX_SLICE_DFL;

		scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
					 enq_flags);
	}
}
|
|
|
|
/*
 * dispatch operation: ask scx_bpf_dsq_move_to_local() to pull work from
 * SHARED_DSQ. @cpu and @prev are not used.
 */
void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}
|
|
|
|
/*
 * running operation: advance the global vtime to @p's vtime if @p is ahead
 * of it. Does nothing in FIFO mode.
 */
void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
{
	if (fifo_sched)
		return;

	/*
	 * Global vtime always progresses forward as tasks start executing. The
	 * test and update can be performed concurrently from multiple CPUs and
	 * thus racy. Any error should be contained and temporary. Let's just
	 * live with it.
	 */
	if (vtime_before(vtime_now, p->scx.dsq_vtime))
		vtime_now = p->scx.dsq_vtime;
}
|
|
|
|
/*
 * stopping operation: charge @p's vtime for the part of its slice it used.
 * Does nothing in FIFO mode. @runnable is not used.
 */
void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
{
	if (fifo_sched)
		return;

	/*
	 * Scale the execution time by the inverse of the weight and charge.
	 *
	 * Note that the default yield implementation yields by setting
	 * @p->scx.slice to zero and the following would treat the yielding task
	 * as if it has consumed all its slice. If this penalizes yielding tasks
	 * too much, determine the execution time by taking explicit timestamps
	 * instead of depending on @p->scx.slice.
	 *
	 * NOTE(review): the factor of 100 presumably matches the default task
	 * weight, so that a default-weight task is charged 1:1 - confirm.
	 */
	p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
}
|
|
|
|
/*
 * enable operation: start @p's vtime at the current global vtime.
 */
void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
{
	p->scx.dsq_vtime = vtime_now;
}
|
|
|
|
s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
|
|
{
|
|
return scx_bpf_create_dsq(SHARED_DSQ, -1);
|
|
}
|
|
|
|
/*
 * exit operation: record the exit info @ei into uei with UEI_RECORD().
 */
void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
|
|
|
|
SCX_OPS_DEFINE(simple_ops,
|
|
.select_cpu = (void *)simple_select_cpu,
|
|
.enqueue = (void *)simple_enqueue,
|
|
.dispatch = (void *)simple_dispatch,
|
|
.running = (void *)simple_running,
|
|
.stopping = (void *)simple_stopping,
|
|
.enable = (void *)simple_enable,
|
|
.init = (void *)simple_init,
|
|
.exit = (void *)simple_exit,
|
|
.name = "simple");
|