mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-10 15:58:47 +00:00
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar: "Three fixes that address an SMP balancing performance regression" * 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/core: Ensure load_balance() respects the active_mask sched/core: Address more wake_affine() regressions sched/core: Fix wake_affine() performance regression
This commit is contained in:
commit
a339b35130
@ -71,14 +71,6 @@ struct sched_domain_shared {
|
||||
atomic_t ref;
|
||||
atomic_t nr_busy_cpus;
|
||||
int has_idle_cores;
|
||||
|
||||
/*
|
||||
* Some variables from the most recent sd_lb_stats for this domain,
|
||||
* used by wake_affine().
|
||||
*/
|
||||
unsigned long nr_running;
|
||||
unsigned long load;
|
||||
unsigned long capacity;
|
||||
};
|
||||
|
||||
struct sched_domain {
|
||||
|
@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
|
||||
return 1;
|
||||
}
|
||||
|
||||
struct llc_stats {
|
||||
unsigned long nr_running;
|
||||
unsigned long load;
|
||||
unsigned long capacity;
|
||||
int has_capacity;
|
||||
};
|
||||
/*
|
||||
* The purpose of wake_affine() is to quickly determine on which CPU we can run
|
||||
* soonest. For the purpose of speed we only consider the waking and previous
|
||||
* CPU.
|
||||
*
|
||||
* wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
|
||||
* will be) idle.
|
||||
*
|
||||
* wake_affine_weight() - considers the weight to reflect the average
|
||||
* scheduling latency of the CPUs. This seems to work
|
||||
* for the overloaded case.
|
||||
*/
|
||||
|
||||
static bool get_llc_stats(struct llc_stats *stats, int cpu)
|
||||
static bool
|
||||
wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
|
||||
int this_cpu, int prev_cpu, int sync)
|
||||
{
|
||||
struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
|
||||
if (idle_cpu(this_cpu))
|
||||
return true;
|
||||
|
||||
if (!sds)
|
||||
return false;
|
||||
if (sync && cpu_rq(this_cpu)->nr_running == 1)
|
||||
return true;
|
||||
|
||||
stats->nr_running = READ_ONCE(sds->nr_running);
|
||||
stats->load = READ_ONCE(sds->load);
|
||||
stats->capacity = READ_ONCE(sds->capacity);
|
||||
stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
|
||||
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Can a task be moved from prev_cpu to this_cpu without causing a load
|
||||
* imbalance that would trigger the load balancer?
|
||||
*
|
||||
* Since we're running on 'stale' values, we might in fact create an imbalance
|
||||
* but recomputing these values is expensive, as that'd mean iteration 2 cache
|
||||
* domains worth of CPUs.
|
||||
*/
|
||||
static bool
|
||||
wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
|
||||
int this_cpu, int prev_cpu, int sync)
|
||||
wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
|
||||
int this_cpu, int prev_cpu, int sync)
|
||||
{
|
||||
struct llc_stats prev_stats, this_stats;
|
||||
s64 this_eff_load, prev_eff_load;
|
||||
unsigned long task_load;
|
||||
|
||||
if (!get_llc_stats(&prev_stats, prev_cpu) ||
|
||||
!get_llc_stats(&this_stats, this_cpu))
|
||||
return false;
|
||||
this_eff_load = target_load(this_cpu, sd->wake_idx);
|
||||
prev_eff_load = source_load(prev_cpu, sd->wake_idx);
|
||||
|
||||
/*
|
||||
* If sync wakeup then subtract the (maximum possible)
|
||||
* effect of the currently running task from the load
|
||||
* of the current LLC.
|
||||
*/
|
||||
if (sync) {
|
||||
unsigned long current_load = task_h_load(current);
|
||||
|
||||
/* in this case load hits 0 and this LLC is considered 'idle' */
|
||||
if (current_load > this_stats.load)
|
||||
if (current_load > this_eff_load)
|
||||
return true;
|
||||
|
||||
this_stats.load -= current_load;
|
||||
this_eff_load -= current_load;
|
||||
}
|
||||
|
||||
/*
|
||||
* The has_capacity stuff is not SMT aware, but by trying to balance
|
||||
* the nr_running on both ends we try and fill the domain at equal
|
||||
* rates, thereby first consuming cores before siblings.
|
||||
*/
|
||||
|
||||
/* if the old cache has capacity, stay there */
|
||||
if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
|
||||
return false;
|
||||
|
||||
/* if this cache has capacity, come here */
|
||||
if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* Check to see if we can move the load without causing too much
|
||||
* imbalance.
|
||||
*/
|
||||
task_load = task_h_load(p);
|
||||
|
||||
this_eff_load = 100;
|
||||
this_eff_load *= prev_stats.capacity;
|
||||
this_eff_load += task_load;
|
||||
if (sched_feat(WA_BIAS))
|
||||
this_eff_load *= 100;
|
||||
this_eff_load *= capacity_of(prev_cpu);
|
||||
|
||||
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
|
||||
prev_eff_load *= this_stats.capacity;
|
||||
|
||||
this_eff_load *= this_stats.load + task_load;
|
||||
prev_eff_load *= prev_stats.load - task_load;
|
||||
prev_eff_load -= task_load;
|
||||
if (sched_feat(WA_BIAS))
|
||||
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
|
||||
prev_eff_load *= capacity_of(this_cpu);
|
||||
|
||||
return this_eff_load <= prev_eff_load;
|
||||
}
|
||||
@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
|
||||
int prev_cpu, int sync)
|
||||
{
|
||||
int this_cpu = smp_processor_id();
|
||||
bool affine;
|
||||
bool affine = false;
|
||||
|
||||
/*
|
||||
* Default to no affine wakeups; wake_affine() should not effect a task
|
||||
* placement the load-balancer feels inclined to undo. The conservative
|
||||
* option is therefore to not move tasks when they wake up.
|
||||
*/
|
||||
affine = false;
|
||||
if (sched_feat(WA_IDLE) && !affine)
|
||||
affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
|
||||
|
||||
/*
|
||||
* If the wakeup is across cache domains, try to evaluate if movement
|
||||
* makes sense, otherwise rely on select_idle_siblings() to do
|
||||
* placement inside the cache domain.
|
||||
*/
|
||||
if (!cpus_share_cache(prev_cpu, this_cpu))
|
||||
affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
|
||||
if (sched_feat(WA_WEIGHT) && !affine)
|
||||
affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
|
||||
|
||||
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
|
||||
if (affine) {
|
||||
@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
|
||||
*/
|
||||
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
|
||||
{
|
||||
struct sched_domain_shared *shared = env->sd->shared;
|
||||
struct sched_domain *child = env->sd->child;
|
||||
struct sched_group *sg = env->sd->groups;
|
||||
struct sg_lb_stats *local = &sds->local_stat;
|
||||
@ -7672,22 +7633,6 @@ next_group:
|
||||
if (env->dst_rq->rd->overload != overload)
|
||||
env->dst_rq->rd->overload = overload;
|
||||
}
|
||||
|
||||
if (!shared)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Since these are sums over groups they can contain some CPUs
|
||||
* multiple times for the NUMA domains.
|
||||
*
|
||||
* Currently only wake_affine_llc() and find_busiest_group()
|
||||
* uses these numbers, only the last is affected by this problem.
|
||||
*
|
||||
* XXX fix that.
|
||||
*/
|
||||
WRITE_ONCE(shared->nr_running, sds->total_running);
|
||||
WRITE_ONCE(shared->load, sds->total_load);
|
||||
WRITE_ONCE(shared->capacity, sds->total_capacity);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -8097,6 +8042,13 @@ static int should_we_balance(struct lb_env *env)
|
||||
struct sched_group *sg = env->sd->groups;
|
||||
int cpu, balance_cpu = -1;
|
||||
|
||||
/*
|
||||
* Ensure the balancing environment is consistent; can happen
|
||||
* when the softirq triggers 'during' hotplug.
|
||||
*/
|
||||
if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* In the newly idle case, we will allow all the cpu's
|
||||
* to do the newly idle load balance.
|
||||
|
@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
|
||||
SCHED_FEAT(LB_MIN, false)
|
||||
SCHED_FEAT(ATTACH_AGE_LOAD, true)
|
||||
|
||||
SCHED_FEAT(WA_IDLE, true)
|
||||
SCHED_FEAT(WA_WEIGHT, true)
|
||||
SCHED_FEAT(WA_BIAS, true)
|
||||
|
Loading…
x
Reference in New Issue
Block a user