Merge branch 'for-6.14' into for-next

Tejun Heo 2025-01-10 08:29:48 -10:00
commit 821148d255
8 changed files with 228 additions and 46 deletions


@@ -789,6 +789,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
void update_rq_clock(struct rq *rq)
{
s64 delta;
u64 clock;
lockdep_assert_rq_held(rq);
@@ -800,11 +801,14 @@ void update_rq_clock(struct rq *rq)
SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
#endif
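/*
 * Note: the clock is read once below and the same sched_clock_cpu() value is
 * used both to advance rq->clock and to publish rq->scx.clock via
 * scx_rq_clock_update().
 */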
clock = sched_clock_cpu(cpu_of(rq));
scx_rq_clock_update(rq, clock);
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
delta = clock - rq->clock;
if (delta < 0)
return;
rq->clock += delta;
update_rq_clock_task(rq, delta);
}


@@ -4915,7 +4915,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
struct task_struct *p;
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
int i, kind;
int i, kind, cpu;
kind = atomic_read(&scx_exit_kind);
while (true) {
@@ -4998,6 +4998,15 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
/*
* Invalidate all the rq clocks to prevent getting outdated
* rq clocks from a previous scx scheduler.
*/
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
scx_rq_clock_invalidate(rq);
}
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
@@ -7603,6 +7612,68 @@ out:
}
#endif
/**
* scx_bpf_now - Returns a high-performance monotonically non-decreasing
* clock for the current CPU. The clock returned is in nanoseconds.
*
* It provides the following properties:
*
* 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently
* to account for execution time and track tasks' runtime properties.
* Unfortunately, on some hardware platforms, bpf_ktime_get_ns() -- which
* eventually reads a hardware timestamp counter -- is neither performant nor
* scalable. scx_bpf_now() aims to provide a high-performance clock by
* using the rq clock in the scheduler core whenever possible.
*
* 2) High enough resolution for the BPF scheduler use cases: In most BPF
* scheduler use cases, the required clock resolution is lower than that of
* the most accurate hardware clock (e.g., rdtsc on x86). scx_bpf_now()
* basically uses the rq clock in the scheduler core whenever it is valid.
* It treats the rq clock as valid from the time it is updated
* (update_rq_clock()) until the rq is unlocked (rq_unpin_lock()).
*
* 3) Monotonically non-decreasing clock on the same CPU: scx_bpf_now()
* guarantees that the clock never goes backward when comparing values read
* on the same CPU. When comparing values read on different CPUs, there is
* no such guarantee -- the clock can go backward. Because the clock is only
* *non-decreasing*, two scx_bpf_now() calls on the same CPU may return the
* same value while the same rq clock value remains valid.
*/
__bpf_kfunc u64 scx_bpf_now(void)
{
struct rq *rq;
u64 clock;
preempt_disable();
rq = this_rq();
if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
/*
* If the rq clock is valid, use the cached rq clock.
*
* Note that scx_bpf_now() is re-entrant between a process
* context and an interrupt context (e.g., timer interrupt).
* However, we don't need to consider the race between them
* because such a race is not observable from the caller.
*/
clock = READ_ONCE(rq->scx.clock);
} else {
/*
* Otherwise, return a fresh rq clock.
*
* Here the clock is read outside of the rq lock, so keep the cached
* rq clock invalid; the next kfunc call outside the rq lock will
* also get a fresh rq clock.
*/
clock = sched_clock_cpu(cpu_of(rq));
}
preempt_enable();
return clock;
}
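/*
 * Illustrative sketch (editorial, not part of this commit): a BPF scheduler
 * might use scx_bpf_now() from its ops.running/ops.stopping callbacks to
 * account per-task CPU time, as the kernel-doc above suggests. The struct,
 * map, and callback names below are hypothetical; time_delta() is the helper
 * added to common.bpf.h in this series.
 */
struct sketch_task_ctx {
	u64			started_at;	/* scx_bpf_now() at ops.running */
	u64			runtime;	/* accumulated CPU time in ns */
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct sketch_task_ctx);
} sketch_task_ctxs SEC(".maps");

void BPF_STRUCT_OPS(sketch_running, struct task_struct *p)
{
	struct sketch_task_ctx *tctx;

	tctx = bpf_task_storage_get(&sketch_task_ctxs, p, 0, 0);
	if (tctx)
		/* timestamp when the task starts running on this CPU */
		tctx->started_at = scx_bpf_now();
}

void BPF_STRUCT_OPS(sketch_stopping, struct task_struct *p, bool runnable)
{
	struct sketch_task_ctx *tctx;

	tctx = bpf_task_storage_get(&sketch_task_ctxs, p, 0, 0);
	if (tctx && tctx->started_at)
		/* accumulate CPU time consumed since ops.running */
		tctx->runtime += time_delta(scx_bpf_now(), tctx->started_at);
}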
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7634,6 +7705,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
#ifdef CONFIG_CGROUP_SCHED
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
BTF_ID_FLAGS(func, scx_bpf_now)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {


@@ -754,6 +754,7 @@ enum scx_rq_flags {
SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17,
@@ -766,9 +767,10 @@ struct scx_rq {
unsigned long ops_qseq;
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
u32 flags;
u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */
bool cpu_released;
u32 flags;
u64 clock; /* current per-rq clock -- see scx_bpf_now() */
cpumask_var_t cpus_to_kick;
cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt;
@@ -1717,6 +1719,38 @@ struct rq_flags {
extern struct balance_callback balance_push_callback;
#ifdef CONFIG_SCHED_CLASS_EXT
extern const struct sched_class ext_sched_class;
DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
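/*
 * scx_rq_clock_update() publishes the clock value before setting
 * SCX_RQ_CLK_VALID with store-release; scx_bpf_now() pairs with this by
 * load-acquiring rq->scx.flags before reading the clock, so a reader that
 * observes SCX_RQ_CLK_VALID also observes the published clock value.
 */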
static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
{
if (!scx_enabled())
return;
WRITE_ONCE(rq->scx.clock, clock);
smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID);
}
static inline void scx_rq_clock_invalidate(struct rq *rq)
{
if (!scx_enabled())
return;
WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID);
}
#else /* !CONFIG_SCHED_CLASS_EXT */
#define scx_enabled() false
#define scx_switched_all() false
static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
static inline void scx_rq_clock_invalidate(struct rq *rq) {}
#endif /* !CONFIG_SCHED_CLASS_EXT */
/*
* Lockdep annotation that avoids accidental unlocks; it's like a
* sticky/continuous lockdep_assert_held().
@@ -1746,7 +1780,7 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
if (rq->clock_update_flags > RQCF_ACT_SKIP)
rf->clock_update_flags = RQCF_UPDATED;
#endif
scx_rq_clock_invalidate(rq);
lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
}
@@ -2505,19 +2539,6 @@ extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SCHED_CLASS_EXT
extern const struct sched_class ext_sched_class;
DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
#else /* !CONFIG_SCHED_CLASS_EXT */
#define scx_enabled() false
#define scx_switched_all() false
#endif /* !CONFIG_SCHED_CLASS_EXT */
/*
* Iterate only active classes. SCX can take over all fair tasks or be
* completely disabled. If the former, skip fair. If the latter, skip SCX.


@@ -76,6 +76,7 @@ bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak;
/*
* Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
@@ -407,6 +408,100 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;
/*
* Time helpers, most of which are from jiffies.h.
*/
/**
* time_delta - Calculate the delta between new and old time stamp
* @after: first comparable as u64
* @before: second comparable as u64
*
* Return: the time difference, which is >= 0
*/
static inline s64 time_delta(u64 after, u64 before)
{
return (s64)(after - before) > 0 ? (s64)(after - before) : 0;
}
/**
* time_after - returns true if the time a is after time b.
* @a: first comparable as u64
* @b: second comparable as u64
*
* Do this with "<0" and ">=0" to only test the sign of the result. A
* good compiler would generate better code (and a really good compiler
* wouldn't care). Gcc is currently neither.
*
* Return: %true if time a is after time b, otherwise %false.
*/
static inline bool time_after(u64 a, u64 b)
{
return (s64)(b - a) < 0;
}
/**
* time_before - returns true if the time a is before time b.
* @a: first comparable as u64
* @b: second comparable as u64
*
* Return: %true if time a is before time b, otherwise %false.
*/
static inline bool time_before(u64 a, u64 b)
{
return time_after(b, a);
}
/**
* time_after_eq - returns true if the time a is after or the same as time b.
* @a: first comparable as u64
* @b: second comparable as u64
*
* Return: %true if time a is after or the same as time b, otherwise %false.
*/
static inline bool time_after_eq(u64 a, u64 b)
{
return (s64)(a - b) >= 0;
}
/**
* time_before_eq - returns true if the time a is before or the same as time b.
* @a: first comparable as u64
* @b: second comparable as u64
*
* Return: %true if time a is before or the same as time b, otherwise %false.
*/
static inline bool time_before_eq(u64 a, u64 b)
{
return time_after_eq(b, a);
}
/**
* time_in_range - Calculate whether a is in the range of [b, c].
* @a: time to test
* @b: beginning of the range
* @c: end of the range
*
* Return: %true if time a is in the range [b, c], otherwise %false.
*/
static inline bool time_in_range(u64 a, u64 b, u64 c)
{
return time_after_eq(a, b) && time_before_eq(a, c);
}
/**
* time_in_range_open - Calculate whether a is in the range of [b, c).
* @a: time to test
* @b: beginning of the range
* @c: end of the range
*
* Return: %true if time a is in the range [b, c), otherwise %false.
*/
static inline bool time_in_range_open(u64 a, u64 b, u64 c)
{
return time_after_eq(a, b) && time_before(a, c);
}
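/*
 * Illustrative sketch (editorial): the helpers above do wrap-safe timestamp
 * arithmetic on u64 clocks such as scx_bpf_now(). For example, a slice-expiry
 * check and a "time remaining" calculation could look like this; started_at
 * and slice_ns are hypothetical values supplied by the caller.
 */
static inline bool sketch_slice_expired(u64 now, u64 started_at, u64 slice_ns)
{
	/* wrap-safe comparison: correct even if the timestamps wrap around */
	return !time_before(now, started_at + slice_ns);
}

static inline u64 sketch_slice_left(u64 now, u64 started_at, u64 slice_ns)
{
	/* time_delta() clamps at zero, so an overrun reads as 0 ns remaining */
	return time_delta(started_at + slice_ns, now);
}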
/*
* Other helpers


@@ -125,6 +125,11 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter,
false; \
})
#define scx_bpf_now() \
(bpf_ksym_exists(scx_bpf_now) ? \
scx_bpf_now() : \
bpf_ktime_get_ns())
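/*
 * Editorial note: scx_bpf_now is declared as a __weak ksym in common.bpf.h,
 * so on kernels that do not provide the kfunc, bpf_ksym_exists() evaluates to
 * false and the verifier prunes the kfunc branch at load time. A scheduler
 * can therefore call scx_bpf_now() unconditionally and transparently fall
 * back to bpf_ktime_get_ns() on older kernels.
 */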
/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().


@@ -87,11 +87,6 @@ struct {
__type(value, struct central_timer);
} central_timer SEC(".maps");
static bool vtime_before(u64 a, u64 b)
{
return (s64)(a - b) < 0;
}
s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
@@ -245,7 +240,7 @@ void BPF_STRUCT_OPS(central_running, struct task_struct *p)
s32 cpu = scx_bpf_task_cpu(p);
u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
if (started_at)
*started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */
*started_at = scx_bpf_now() ?: 1; /* 0 indicates idle */
}
void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable)
@@ -258,7 +253,7 @@ void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable)
static int central_timerfn(void *map, int *key, struct bpf_timer *timer)
{
u64 now = bpf_ktime_get_ns();
u64 now = scx_bpf_now();
u64 nr_to_kick = nr_queued;
s32 i, curr_cpu;
@@ -279,7 +274,7 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer)
/* kick iff the current one exhausted its slice */
started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
if (started_at && *started_at &&
vtime_before(now, *started_at + slice_ns))
time_before(now, *started_at + slice_ns))
continue;
/* and there's something pending */


@@ -137,11 +137,6 @@ static u64 div_round_up(u64 dividend, u64 divisor)
return (dividend + divisor - 1) / divisor;
}
static bool vtime_before(u64 a, u64 b)
{
return (s64)(a - b) < 0;
}
static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
struct cgv_node *cgc_a, *cgc_b;
@@ -271,7 +266,7 @@ static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc)
*/
max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) /
(2 * FCG_HWEIGHT_ONE);
if (vtime_before(cvtime, cvtime_now - max_budget))
if (time_before(cvtime, cvtime_now - max_budget))
cvtime = cvtime_now - max_budget;
cgv_node->cvtime = cvtime;
@@ -401,7 +396,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
* Limit the amount of budget that an idling task can accumulate
* to one slice.
*/
if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL))
if (time_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL))
tvtime = cgc->tvtime_now - SCX_SLICE_DFL;
scx_bpf_dsq_insert_vtime(p, cgrp->kn->id, SCX_SLICE_DFL,
@@ -535,7 +530,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
* from multiple CPUs and thus racy. Any error should be
* contained and temporary. Let's just live with it.
*/
if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime))
if (time_before(cgc->tvtime_now, p->scx.dsq_vtime))
cgc->tvtime_now = p->scx.dsq_vtime;
}
bpf_cgroup_release(cgrp);
@@ -645,7 +640,7 @@ static bool try_pick_next_cgroup(u64 *cgidp)
cgv_node = container_of(rb_node, struct cgv_node, rb_node);
cgid = cgv_node->cgid;
if (vtime_before(cvtime_now, cgv_node->cvtime))
if (time_before(cvtime_now, cgv_node->cvtime))
cvtime_now = cgv_node->cvtime;
/*
@@ -734,7 +729,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
struct fcg_cpu_ctx *cpuc;
struct fcg_cgrp_ctx *cgc;
struct cgroup *cgrp;
u64 now = bpf_ktime_get_ns();
u64 now = scx_bpf_now();
bool picked_next = false;
cpuc = find_cpu_ctx();
@@ -744,7 +739,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
if (!cpuc->cur_cgid)
goto pick_next_cgroup;
if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) {
if (time_before(now, cpuc->cur_at + cgrp_slice_ns)) {
if (scx_bpf_dsq_move_to_local(cpuc->cur_cgid)) {
stat_inc(FCG_STAT_CNS_KEEP);
return;
@@ -920,14 +915,14 @@ void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
struct cgroup *from, struct cgroup *to)
{
struct fcg_cgrp_ctx *from_cgc, *to_cgc;
s64 vtime_delta;
s64 delta;
/* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */
if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to)))
return;
vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now;
p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta;
delta = time_delta(p->scx.dsq_vtime, from_cgc->tvtime_now);
p->scx.dsq_vtime = to_cgc->tvtime_now + delta;
}
s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init)


@@ -52,11 +52,6 @@ static void stat_inc(u32 idx)
(*cnt_p)++;
}
static inline bool vtime_before(u64 a, u64 b)
{
return (s64)(a - b) < 0;
}
s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
bool is_idle = false;
@@ -84,7 +79,7 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
* Limit the amount of budget that an idling task can accumulate
* to one slice.
*/
if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
if (time_before(vtime, vtime_now - SCX_SLICE_DFL))
vtime = vtime_now - SCX_SLICE_DFL;
scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
@@ -108,7 +103,7 @@ void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
* thus racy. Any error should be contained and temporary. Let's just
* live with it.
*/
if (vtime_before(vtime_now, p->scx.dsq_vtime))
if (time_before(vtime_now, p->scx.dsq_vtime))
vtime_now = p->scx.dsq_vtime;
}