Scheduler fixes:

- When stime is larger than rtime due to accounting imprecision, then
      utime = rtime - stime becomes negative. As this is unsigned math, the
      result becomes a huge positive number. Cure it by resetting stime to
      rtime in that case, so utime becomes 0.
 
    - Restore consistent state when sched_cpu_deactivate() fails.
 
      When offlining a CPU fails in sched_cpu_deactivate() after the SMT
      present counter has been decremented, then the function aborts but
      fails to increment the SMT present counter and leaves it imbalanced.
      Consecutive operations cause it to underflow. Add the missing fixup
      for the error path.
 
      As with SMT accounting, the runqueue needs to be marked online again in
      the error exit path to restore consistent state.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmavdT8THHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYodVrEACDLJdjkM2n3T7EL8YjuBjkCW3dGWAZ
 umpJGjwMDsT9/oLIU7B1wgX/IdWppssQa+0yXxZy7cKQvfP5VTd4fueuub2k5sJc
 yDy5J8N0xRYvOhA0lrnp6jyqhhCZzIGDmSn3G+lDuQuuffaqfFbPkeMwoXmewiyt
 72ajFsjeo7q25pm8ALgBhrSKfO5FFV1HJoAyoYKEyT5E/pliKNWrzbcA+PWstMK3
 DWmj8dgYk6g/dBwNl6wORlpmcxjcDO65icH5XPSsadwosHe7q1+quIJSqMDyXHNY
 qQ5r5f9bvXdq5DPKRON0GJb9gfSQNX5yE/pKdyW75mqHMxJ/pnIIds6h6mLHBewt
 eZ5M1a/v8o+QiqQcDogk5DUzZlI46bKZsYLqU9y6v/WgUqa5C4BaEJT7CrQk+6wp
 xUB4g3j/+asih55Tq9HKo6PEY8NLj4ytKHgeh0EvEllDxGmnRYR+PEdzLBjuWlAY
 ka2/1vaNr/r5grbpQhO6N4txUAASoKF6nx1hq05I/lY45KA+RgeU0mgEN07Pa6HZ
 4873Q2CnVUlvMVFulOUkJogGNk7KTDb3e7/+BMsA9Lda/2KmqaOLEh5T6egdLZ0G
 feb/UQ6hoYcCD0IAsj9MfEOS3IVhOvtkJSwwLi/j09ucmC+5Ar3v3/Aw1EtTHJHm
 ObdoEXJC98RLFA==
 =b/q0
 -----END PGP SIGNATURE-----

Merge tag 'sched-urgent-2024-08-04' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:

 - When stime is larger than rtime due to accounting imprecision, then
   utime = rtime - stime becomes negative. As this is unsigned math, the
   result becomes a huge positive number.

   Cure it by resetting stime to rtime in that case, so utime becomes 0.

 - Restore consistent state when sched_cpu_deactivate() fails.

   When offlining a CPU fails in sched_cpu_deactivate() after the SMT
   present counter has been decremented, then the function aborts but
   fails to increment the SMT present counter and leaves it imbalanced.
   Consecutive operations cause it to underflow. Add the missing fixup
   for the error path.

   As with SMT accounting, the runqueue needs to be marked online again in
   the error exit path to restore consistent state.

* tag 'sched-urgent-2024-08-04' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/core: Fix unbalance set_rq_online/offline() in sched_cpu_deactivate()
  sched/core: Introduce sched_set_rq_on/offline() helper
  sched/smt: Fix unbalance sched_smt_present dec/inc
  sched/smt: Introduce sched_smt_present_inc/dec() helper
  sched/cputime: Fix mul_u64_u64_div_u64() precision for cputime
This commit is contained in:
Linus Torvalds 2024-08-04 08:46:14 -07:00
commit 6cc82dc2bd
2 changed files with 53 additions and 21 deletions

View File

@ -7845,6 +7845,30 @@ void set_rq_offline(struct rq *rq)
}
}
static inline void sched_set_rq_online(struct rq *rq, int cpu)
{
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
}
rq_unlock_irqrestore(rq, &rf);
}
static inline void sched_set_rq_offline(struct rq *rq, int cpu)
{
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
rq_unlock_irqrestore(rq, &rf);
}
/*
* used to mark begin/end of suspend/resume:
*/
@ -7895,10 +7919,25 @@ static int cpuset_cpu_inactive(unsigned int cpu)
return 0;
}
/*
 * Account @cpu in the sched_smt_present static key (the number of cores
 * with SMT present) when it comes up.
 *
 * The key is incremented only when the weight of cpu_smt_mask(@cpu) is
 * exactly 2. Compiles to an empty function without CONFIG_SCHED_SMT.
 *
 * NOTE(review): the _cpuslocked variant presumably requires the caller to
 * hold the CPU hotplug lock - confirm against the callers.
 */
static inline void sched_smt_present_inc(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	if (cpumask_weight(cpu_smt_mask(cpu)) != 2)
		return;

	static_branch_inc_cpuslocked(&sched_smt_present);
#endif
}
/*
 * Remove @cpu from the sched_smt_present static key (the number of cores
 * with SMT present) when it goes down.
 *
 * The key is decremented only when the weight of cpu_smt_mask(@cpu) is
 * exactly 2. Compiles to an empty function without CONFIG_SCHED_SMT.
 *
 * NOTE(review): the _cpuslocked variant presumably requires the caller to
 * hold the CPU hotplug lock - confirm against the callers.
 */
static inline void sched_smt_present_dec(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	if (cpumask_weight(cpu_smt_mask(cpu)) != 2)
		return;

	static_branch_dec_cpuslocked(&sched_smt_present);
#endif
}
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
/*
* Clear the balance_push callback and prepare to schedule
@ -7906,13 +7945,10 @@ int sched_cpu_activate(unsigned int cpu)
*/
balance_push_set(cpu, false);
#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_inc_cpuslocked(&sched_smt_present);
#endif
sched_smt_present_inc(cpu);
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
@ -7930,12 +7966,7 @@ int sched_cpu_activate(unsigned int cpu)
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
*/
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
}
rq_unlock_irqrestore(rq, &rf);
sched_set_rq_online(rq, cpu);
return 0;
}
@ -7943,7 +7974,6 @@ int sched_cpu_activate(unsigned int cpu)
int sched_cpu_deactivate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
int ret;
/*
@ -7974,20 +8004,14 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu();
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
rq_unlock_irqrestore(rq, &rf);
sched_set_rq_offline(rq, cpu);
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_dec_cpuslocked(&sched_smt_present);
sched_smt_present_dec(cpu);
#ifdef CONFIG_SCHED_SMT
sched_core_cpu_deactivate(cpu);
#endif
@ -7997,6 +8021,8 @@ int sched_cpu_deactivate(unsigned int cpu)
sched_update_numa(cpu, false);
ret = cpuset_cpu_inactive(cpu);
if (ret) {
sched_smt_present_inc(cpu);
sched_set_rq_online(rq, cpu);
balance_push_set(cpu, false);
set_cpu_active(cpu, true);
sched_update_numa(cpu, true);

View File

@ -582,6 +582,12 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
}
stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
/*
* Because mul_u64_u64_div_u64() can approximate on some
* architectures; enforce the constraint that: a*b/(b+c) <= a.
*/
if (unlikely(stime > rtime))
stime = rtime;
update:
/*