mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-16 09:56:46 +00:00
sched/fair: Simplify util_est
With UTIL_EST_FASTUP now being permanent, we can take advantage of the fact that the ewma jumps directly to a higher utilization at dequeue to simplify util_est and remove the enqueued field. Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org> Signed-off-by: Ingo Molnar <mingo@kernel.org> Tested-by: Lukasz Luba <lukasz.luba@arm.com> Reviewed-by: Lukasz Luba <lukasz.luba@arm.com> Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com> Reviewed-by: Hongyan Xia <hongyan.xia2@arm.com> Reviewed-by: Alex Shi <alexs@kernel.org> Link: https://lore.kernel.org/r/20231201161652.1241695-3-vincent.guittot@linaro.org
This commit is contained in:
parent
7736ae5572
commit
11137d3849
@ -415,42 +415,6 @@ struct load_weight {
|
||||
u32 inv_weight;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct util_est - Estimation utilization of FAIR tasks
|
||||
* @enqueued: instantaneous estimated utilization of a task/cpu
|
||||
* @ewma: the Exponential Weighted Moving Average (EWMA)
|
||||
* utilization of a task
|
||||
*
|
||||
* Support data structure to track an Exponential Weighted Moving Average
|
||||
* (EWMA) of a FAIR task's utilization. New samples are added to the moving
|
||||
* average each time a task completes an activation. Sample's weight is chosen
|
||||
* so that the EWMA will be relatively insensitive to transient changes to the
|
||||
* task's workload.
|
||||
*
|
||||
* The enqueued attribute has a slightly different meaning for tasks and cpus:
|
||||
* - task: the task's util_avg at last task dequeue time
|
||||
* - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
|
||||
* Thus, the util_est.enqueued of a task represents the contribution on the
|
||||
* estimated utilization of the CPU where that task is currently enqueued.
|
||||
*
|
||||
* Only for tasks we track a moving average of the past instantaneous
|
||||
* estimated utilization. This allows to absorb sporadic drops in utilization
|
||||
* of an otherwise almost periodic task.
|
||||
*
|
||||
* The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
|
||||
* updates. When a task is dequeued, its util_est should not be updated if its
|
||||
* util_avg has not been updated in the meantime.
|
||||
* This information is mapped into the MSB bit of util_est.enqueued at dequeue
|
||||
* time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
|
||||
* for a task) it is safe to use MSB.
|
||||
*/
|
||||
struct util_est {
|
||||
unsigned int enqueued;
|
||||
unsigned int ewma;
|
||||
#define UTIL_EST_WEIGHT_SHIFT 2
|
||||
#define UTIL_AVG_UNCHANGED 0x80000000
|
||||
} __attribute__((__aligned__(sizeof(u64))));
|
||||
|
||||
/*
|
||||
* The load/runnable/util_avg accumulates an infinite geometric series
|
||||
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
|
||||
@ -505,9 +469,20 @@ struct sched_avg {
|
||||
unsigned long load_avg;
|
||||
unsigned long runnable_avg;
|
||||
unsigned long util_avg;
|
||||
struct util_est util_est;
|
||||
unsigned int util_est;
|
||||
} ____cacheline_aligned;
|
||||
|
||||
/*
|
||||
* The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
|
||||
* updates. When a task is dequeued, its util_est should not be updated if its
|
||||
* util_avg has not been updated in the meantime.
|
||||
* This information is mapped into the MSB bit of util_est at dequeue time.
|
||||
* Since max value of util_est for a task is 1024 (PELT util_avg for a task)
|
||||
* it is safe to use MSB.
|
||||
*/
|
||||
#define UTIL_EST_WEIGHT_SHIFT 2
|
||||
#define UTIL_AVG_UNCHANGED 0x80000000
|
||||
|
||||
struct sched_statistics {
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
u64 wait_start;
|
||||
|
@ -684,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
cfs_rq->avg.runnable_avg);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
|
||||
cfs_rq->avg.util_avg);
|
||||
SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
|
||||
cfs_rq->avg.util_est.enqueued);
|
||||
SEQ_printf(m, " .%-30s: %u\n", "util_est",
|
||||
cfs_rq->avg.util_est);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
|
||||
cfs_rq->removed.load_avg);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
|
||||
@ -1075,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
|
||||
P(se.avg.runnable_avg);
|
||||
P(se.avg.util_avg);
|
||||
P(se.avg.last_update_time);
|
||||
P(se.avg.util_est.ewma);
|
||||
PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
|
||||
PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
|
||||
#endif
|
||||
#ifdef CONFIG_UCLAMP_TASK
|
||||
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
|
||||
|
@ -4781,9 +4781,7 @@ static inline unsigned long task_runnable(struct task_struct *p)
|
||||
|
||||
static inline unsigned long _task_util_est(struct task_struct *p)
|
||||
{
|
||||
struct util_est ue = READ_ONCE(p->se.avg.util_est);
|
||||
|
||||
return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
|
||||
return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
|
||||
}
|
||||
|
||||
static inline unsigned long task_util_est(struct task_struct *p)
|
||||
@ -4800,9 +4798,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
|
||||
return;
|
||||
|
||||
/* Update root cfs_rq's estimated utilization */
|
||||
enqueued = cfs_rq->avg.util_est.enqueued;
|
||||
enqueued = cfs_rq->avg.util_est;
|
||||
enqueued += _task_util_est(p);
|
||||
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
|
||||
WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
|
||||
|
||||
trace_sched_util_est_cfs_tp(cfs_rq);
|
||||
}
|
||||
@ -4816,34 +4814,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
|
||||
return;
|
||||
|
||||
/* Update root cfs_rq's estimated utilization */
|
||||
enqueued = cfs_rq->avg.util_est.enqueued;
|
||||
enqueued = cfs_rq->avg.util_est;
|
||||
enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
|
||||
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
|
||||
WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
|
||||
|
||||
trace_sched_util_est_cfs_tp(cfs_rq);
|
||||
}
|
||||
|
||||
#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
|
||||
|
||||
/*
|
||||
* Check if a (signed) value is within a specified (unsigned) margin,
|
||||
* based on the observation that:
|
||||
*
|
||||
* abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
|
||||
*
|
||||
* NOTE: this only works when value + margin < INT_MAX.
|
||||
*/
|
||||
static inline bool within_margin(int value, int margin)
|
||||
{
|
||||
return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
|
||||
}
|
||||
|
||||
static inline void util_est_update(struct cfs_rq *cfs_rq,
|
||||
struct task_struct *p,
|
||||
bool task_sleep)
|
||||
{
|
||||
long last_ewma_diff, last_enqueued_diff;
|
||||
struct util_est ue;
|
||||
unsigned int ewma, dequeued, last_ewma_diff;
|
||||
|
||||
if (!sched_feat(UTIL_EST))
|
||||
return;
|
||||
@ -4855,23 +4839,25 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
|
||||
if (!task_sleep)
|
||||
return;
|
||||
|
||||
/* Get current estimate of utilization */
|
||||
ewma = READ_ONCE(p->se.avg.util_est);
|
||||
|
||||
/*
|
||||
* If the PELT values haven't changed since enqueue time,
|
||||
* skip the util_est update.
|
||||
*/
|
||||
ue = p->se.avg.util_est;
|
||||
if (ue.enqueued & UTIL_AVG_UNCHANGED)
|
||||
if (ewma & UTIL_AVG_UNCHANGED)
|
||||
return;
|
||||
|
||||
last_enqueued_diff = ue.enqueued;
|
||||
/* Get utilization at dequeue */
|
||||
dequeued = task_util(p);
|
||||
|
||||
/*
|
||||
* Reset EWMA on utilization increases, the moving average is used only
|
||||
* to smooth utilization decreases.
|
||||
*/
|
||||
ue.enqueued = task_util(p);
|
||||
if (ue.ewma < ue.enqueued) {
|
||||
ue.ewma = ue.enqueued;
|
||||
if (ewma <= dequeued) {
|
||||
ewma = dequeued;
|
||||
goto done;
|
||||
}
|
||||
|
||||
@ -4879,27 +4865,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
|
||||
* Skip update of task's estimated utilization when its members are
|
||||
* already ~1% close to its last activation value.
|
||||
*/
|
||||
last_ewma_diff = ue.enqueued - ue.ewma;
|
||||
last_enqueued_diff -= ue.enqueued;
|
||||
if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
|
||||
if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
|
||||
goto done;
|
||||
|
||||
return;
|
||||
}
|
||||
last_ewma_diff = ewma - dequeued;
|
||||
if (last_ewma_diff < UTIL_EST_MARGIN)
|
||||
goto done;
|
||||
|
||||
/*
|
||||
* To avoid overestimation of actual task utilization, skip updates if
|
||||
* we cannot grant there is idle time in this CPU.
|
||||
*/
|
||||
if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
|
||||
if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
|
||||
return;
|
||||
|
||||
/*
|
||||
* To avoid underestimate of task utilization, skip updates of EWMA if
|
||||
* we cannot grant that thread got all CPU time it wanted.
|
||||
*/
|
||||
if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
|
||||
if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
|
||||
goto done;
|
||||
|
||||
|
||||
@ -4907,25 +4888,24 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
|
||||
* Update Task's estimated utilization
|
||||
*
|
||||
* When *p completes an activation we can consolidate another sample
|
||||
* of the task size. This is done by storing the current PELT value
|
||||
* as ue.enqueued and by using this value to update the Exponential
|
||||
* Weighted Moving Average (EWMA):
|
||||
* of the task size. This is done by using this value to update the
|
||||
* Exponential Weighted Moving Average (EWMA):
|
||||
*
|
||||
* ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
|
||||
* = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
|
||||
* = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
|
||||
* = w * ( last_ewma_diff ) + ewma(t-1)
|
||||
* = w * (last_ewma_diff + ewma(t-1) / w)
|
||||
* = w * ( -last_ewma_diff ) + ewma(t-1)
|
||||
* = w * (-last_ewma_diff + ewma(t-1) / w)
|
||||
*
|
||||
* Where 'w' is the weight of new samples, which is configured to be
|
||||
* 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
|
||||
*/
|
||||
ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
|
||||
ue.ewma += last_ewma_diff;
|
||||
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
|
||||
ewma <<= UTIL_EST_WEIGHT_SHIFT;
|
||||
ewma -= last_ewma_diff;
|
||||
ewma >>= UTIL_EST_WEIGHT_SHIFT;
|
||||
done:
|
||||
ue.enqueued |= UTIL_AVG_UNCHANGED;
|
||||
WRITE_ONCE(p->se.avg.util_est, ue);
|
||||
ewma |= UTIL_AVG_UNCHANGED;
|
||||
WRITE_ONCE(p->se.avg.util_est, ewma);
|
||||
|
||||
trace_sched_util_est_se_tp(&p->se);
|
||||
}
|
||||
@ -7653,16 +7633,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
|
||||
if (sched_feat(UTIL_EST)) {
|
||||
unsigned long util_est;
|
||||
|
||||
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
|
||||
util_est = READ_ONCE(cfs_rq->avg.util_est);
|
||||
|
||||
/*
|
||||
* During wake-up @p isn't enqueued yet and doesn't contribute
|
||||
* to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
|
||||
* to any cpu_rq(cpu)->cfs.avg.util_est.
|
||||
* If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
|
||||
* has been enqueued.
|
||||
*
|
||||
* During exec (@dst_cpu = -1) @p is enqueued and does
|
||||
* contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
|
||||
* contribute to cpu_rq(cpu)->cfs.util_est.
|
||||
* Remove it to "simulate" cpu_util without @p's contribution.
|
||||
*
|
||||
* Despite the task_on_rq_queued(@p) check there is still a
|
||||
|
@ -52,13 +52,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
|
||||
return;
|
||||
|
||||
/* Avoid store if the flag has been already reset */
|
||||
enqueued = avg->util_est.enqueued;
|
||||
enqueued = avg->util_est;
|
||||
if (!(enqueued & UTIL_AVG_UNCHANGED))
|
||||
return;
|
||||
|
||||
/* Reset flag to report util_avg has been updated */
|
||||
enqueued &= ~UTIL_AVG_UNCHANGED;
|
||||
WRITE_ONCE(avg->util_est.enqueued, enqueued);
|
||||
WRITE_ONCE(avg->util_est, enqueued);
|
||||
}
|
||||
|
||||
static inline u64 rq_clock_pelt(struct rq *rq)
|
||||
|
Loading…
x
Reference in New Issue
Block a user