Merge branches 'pm-cpuidle' and 'pm-em'

Merge cpuidle and Energy Model changes for 6.13-rc1:

 - Add a built-in idle states table for Granite Rapids Xeon D to the
   intel_idle driver (Artem Bityutskiy).

 - Fix some typos in comments in the cpuidle core and drivers (Shen
   Lichuan).

 - Remove iowait influence from the menu cpuidle governor (Christian
   Loehle).

 - Add min/max available performance state limits to the Energy Model
   management code (Lukasz Luba).

* pm-cpuidle:
  intel_idle: add Granite Rapids Xeon D support
  cpuidle: Correct some typos in comments
  cpuidle: menu: Remove iowait influence

* pm-em:
  PM: EM: Add min/max available performance state limits
This commit is contained in:
Rafael J. Wysocki 2024-11-15 19:54:05 +01:00
commit 923c256e37
8 changed files with 135 additions and 80 deletions

View File

@ -139,7 +139,7 @@ static int __init arm_idle_init_cpu(int cpu)
*
* Initializes arm cpuidle driver for all CPUs, if any CPU fails
* to register cpuidle driver then rollback to cancel all CPUs
* registeration.
* registration.
*/
static int __init arm_idle_init(void)
{

View File

@ -48,7 +48,7 @@ static int qcom_cpu_spc(struct spm_driver_data *drv)
ret = cpu_suspend(0, qcom_pm_collapse);
/*
* ARM common code executes WFI without calling into our driver and
* if the SPM mode is not reset, then we may accidently power down the
* if the SPM mode is not reset, then we may accidentally power down the
* cpu when we intended only to gate the cpu clock.
* Ensure the state is set to standby before returning.
*/

View File

@ -406,7 +406,7 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
* Min polling interval of 10usec is a guess. It is assuming that
* for most users, the time for a single ping-pong workload like
* perf bench pipe would generally complete within 10usec but
* this is hardware dependant. Actual time can be estimated with
* this is hardware dependent. Actual time can be estimated with
*
* perf bench sched pipe -l 10000
*

View File

@ -261,7 +261,7 @@ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv)
* @drv: a pointer to a valid struct cpuidle_driver
*
* Register the driver under a lock to prevent concurrent attempts to
* [un]register the driver from occuring at the same time.
* [un]register the driver from occurring at the same time.
*
* Returns 0 on success, a negative error code (returned by
* __cpuidle_register_driver()) otherwise.
@ -296,7 +296,7 @@ EXPORT_SYMBOL_GPL(cpuidle_register_driver);
* @drv: a pointer to a valid struct cpuidle_driver
*
* Unregisters the cpuidle driver under a lock to prevent concurrent attempts
* to [un]register the driver from occuring at the same time. @drv has to
* to [un]register the driver from occurring at the same time. @drv has to
* match the currently registered driver.
*/
void cpuidle_unregister_driver(struct cpuidle_driver *drv)

View File

@ -19,7 +19,7 @@
#include "gov.h"
#define BUCKETS 12
#define BUCKETS 6
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
@ -29,12 +29,11 @@
/*
* Concepts and ideas behind the menu governor
*
* For the menu governor, there are 3 decision factors for picking a C
* For the menu governor, there are 2 decision factors for picking a C
* state:
* 1) Energy break even point
* 2) Performance impact
* 3) Latency tolerance (from pmqos infrastructure)
* These three factors are treated independently.
* 2) Latency tolerance (from pmqos infrastructure)
* These two factors are treated independently.
*
* Energy break even point
* -----------------------
@ -75,30 +74,6 @@
* intervals and if the stand deviation of these 8 intervals is below a
* threshold value, we use the average of these intervals as prediction.
*
* Limiting Performance Impact
* ---------------------------
* C states, especially those with large exit latencies, can have a real
* noticeable impact on workloads, which is not acceptable for most sysadmins,
* and in addition, less performance has a power price of its own.
*
* As a general rule of thumb, menu assumes that the following heuristic
* holds:
* The busier the system, the less impact of C states is acceptable
*
* This rule-of-thumb is implemented using a performance-multiplier:
* If the exit latency times the performance multiplier is longer than
* the predicted duration, the C state is not considered a candidate
* for selection due to a too high performance impact. So the higher
* this multiplier is, the longer we need to be idle to pick a deep C
* state, and thus the less likely a busy CPU will hit such a deep
* C state.
*
* Currently there is only one value determining the factor:
* 10 points are added for each process that is waiting for IO on this CPU.
* (This value was experimentally determined.)
* Utilization is no longer a factor as it was shown that it never contributed
* significantly to the performance multiplier in the first place.
*
*/
struct menu_device {
@ -112,19 +87,10 @@ struct menu_device {
int interval_ptr;
};
static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
static inline int which_bucket(u64 duration_ns)
{
int bucket = 0;
/*
* We keep two groups of stats; one with no
* IO pending, one without.
* This allows us to calculate
* E(duration)|iowait
*/
if (nr_iowaiters)
bucket = BUCKETS/2;
if (duration_ns < 10ULL * NSEC_PER_USEC)
return bucket;
if (duration_ns < 100ULL * NSEC_PER_USEC)
@ -138,19 +104,6 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
return bucket + 5;
}
/*
* Return a multiplier for the exit latency that is intended
* to take performance requirements into account.
* The more performance critical we estimate the system
* to be, the higher this multiplier, and thus the higher
* the barrier to go to an expensive C state.
*/
static inline int performance_multiplier(unsigned int nr_iowaiters)
{
/* for IO wait tasks (per cpu!) we add 10x each */
return 1 + 10 * nr_iowaiters;
}
static DEFINE_PER_CPU(struct menu_device, menu_devices);
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
@ -258,8 +211,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
struct menu_device *data = this_cpu_ptr(&menu_devices);
s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
u64 predicted_ns;
u64 interactivity_req;
unsigned int nr_iowaiters;
ktime_t delta, delta_tick;
int i, idx;
@ -268,8 +219,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
data->needs_update = 0;
}
nr_iowaiters = nr_iowait_cpu(dev->cpu);
/* Find the shortest expected idle interval. */
predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
@ -283,7 +232,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
}
data->next_timer_ns = delta;
data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
data->bucket = which_bucket(data->next_timer_ns);
/* Round up the result for half microseconds. */
timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
@ -301,7 +250,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
data->next_timer_ns = KTIME_MAX;
delta_tick = TICK_NSEC / 2;
data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
data->bucket = which_bucket(KTIME_MAX);
}
if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
@ -328,15 +277,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
if (predicted_ns < TICK_NSEC)
predicted_ns = data->next_timer_ns;
} else {
/*
* Use the performance multiplier and the user-configurable
* latency_req to determine the maximum exit latency.
*/
interactivity_req = div64_u64(predicted_ns,
performance_multiplier(nr_iowaiters));
if (latency_req > interactivity_req)
latency_req = interactivity_req;
} else if (latency_req > predicted_ns) {
latency_req = predicted_ns;
}
/*

View File

@ -1069,6 +1069,47 @@ static struct cpuidle_state gnr_cstates[] __initdata = {
.enter = NULL }
};
static struct cpuidle_state gnrd_cstates[] __initdata = {
{
.name = "C1",
.desc = "MWAIT 0x00",
.flags = MWAIT2flg(0x00),
.exit_latency = 1,
.target_residency = 1,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C1E",
.desc = "MWAIT 0x01",
.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 4,
.target_residency = 4,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C6",
.desc = "MWAIT 0x20",
.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED |
CPUIDLE_FLAG_INIT_XSTATE |
CPUIDLE_FLAG_PARTIAL_HINT_MATCH,
.exit_latency = 220,
.target_residency = 650,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C6P",
.desc = "MWAIT 0x21",
.flags = MWAIT2flg(0x21) | CPUIDLE_FLAG_TLB_FLUSHED |
CPUIDLE_FLAG_INIT_XSTATE |
CPUIDLE_FLAG_PARTIAL_HINT_MATCH,
.exit_latency = 240,
.target_residency = 750,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.enter = NULL }
};
static struct cpuidle_state atom_cstates[] __initdata = {
{
.name = "C1E",
@ -1508,6 +1549,12 @@ static const struct idle_cpu idle_cpu_gnr __initconst = {
.use_acpi = true,
};
static const struct idle_cpu idle_cpu_gnrd __initconst = {
.state_table = gnrd_cstates,
.disable_promotion_to_c1e = true,
.use_acpi = true,
};
static const struct idle_cpu idle_cpu_avn __initconst = {
.state_table = avn_cstates,
.disable_promotion_to_c1e = true,
@ -1593,6 +1640,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &idle_cpu_spr),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &idle_cpu_spr),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &idle_cpu_gnr),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &idle_cpu_gnrd),
X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &idle_cpu_knl),
X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &idle_cpu_knl),
X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &idle_cpu_bxt),

View File

@ -55,6 +55,8 @@ struct em_perf_table {
* struct em_perf_domain - Performance domain
* @em_table: Pointer to the runtime modifiable em_perf_table
* @nr_perf_states: Number of performance states
* @min_perf_state: Minimum allowed Performance State index
* @max_perf_state: Maximum allowed Performance State index
* @flags: See "em_perf_domain flags"
* @cpus: Cpumask covering the CPUs of the domain. It's here
* for performance reasons to avoid potential cache
@ -70,6 +72,8 @@ struct em_perf_table {
struct em_perf_domain {
struct em_perf_table __rcu *em_table;
int nr_perf_states;
int min_perf_state;
int max_perf_state;
unsigned long flags;
unsigned long cpus[];
};
@ -173,13 +177,14 @@ void em_table_free(struct em_perf_table __rcu *table);
int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
int nr_states);
int em_dev_update_chip_binning(struct device *dev);
int em_update_performance_limits(struct em_perf_domain *pd,
unsigned long freq_min_khz, unsigned long freq_max_khz);
/**
* em_pd_get_efficient_state() - Get an efficient performance state from the EM
* @table: List of performance states, in ascending order
* @nr_perf_states: Number of performance states
* @pd: performance domain for which this must be done
* @max_util: Max utilization to map with the EM
* @pd_flags: Performance Domain flags
*
* It is called from the scheduler code quite frequently and as a consequence
* doesn't implement any check.
@ -188,13 +193,16 @@ int em_dev_update_chip_binning(struct device *dev);
* requirement.
*/
static inline int
em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
unsigned long max_util, unsigned long pd_flags)
em_pd_get_efficient_state(struct em_perf_state *table,
struct em_perf_domain *pd, unsigned long max_util)
{
unsigned long pd_flags = pd->flags;
int min_ps = pd->min_perf_state;
int max_ps = pd->max_perf_state;
struct em_perf_state *ps;
int i;
for (i = 0; i < nr_perf_states; i++) {
for (i = min_ps; i <= max_ps; i++) {
ps = &table[i];
if (ps->performance >= max_util) {
if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
@ -204,7 +212,7 @@ em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
}
}
return nr_perf_states - 1;
return max_ps;
}
/**
@ -253,8 +261,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
* requested performance.
*/
em_table = rcu_dereference(pd->em_table);
i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
max_util, pd->flags);
i = em_pd_get_efficient_state(em_table->state, pd, max_util);
ps = &em_table->state[i];
/*
@ -391,6 +398,12 @@ static inline int em_dev_update_chip_binning(struct device *dev)
{
return -EINVAL;
}
static inline
int em_update_performance_limits(struct em_perf_domain *pd,
unsigned long freq_min_khz, unsigned long freq_max_khz)
{
return -EINVAL;
}
#endif
#endif

View File

@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
goto unlock;
dev->em_pd->flags |= flags;
dev->em_pd->min_perf_state = 0;
dev->em_pd->max_perf_state = nr_states - 1;
em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev)
return em_recalc_and_update(dev, pd, em_table);
}
EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
/**
* em_update_performance_limits() - Update Energy Model with performance
* limits information.
* @pd : Performance Domain with EM that has to be updated.
* @freq_min_khz : New minimum allowed frequency for this device.
* @freq_max_khz : New maximum allowed frequency for this device.
*
* This function allows to update the EM with information about available
* performance levels. It takes the minimum and maximum frequency in kHz
* and does internal translation to performance levels.
* Returns 0 on success or -EINVAL when failed.
*/
int em_update_performance_limits(struct em_perf_domain *pd,
unsigned long freq_min_khz, unsigned long freq_max_khz)
{
struct em_perf_state *table;
int min_ps = -1;
int max_ps = -1;
int i;
if (!pd)
return -EINVAL;
rcu_read_lock();
table = em_perf_state_from_pd(pd);
for (i = 0; i < pd->nr_perf_states; i++) {
if (freq_min_khz == table[i].frequency)
min_ps = i;
if (freq_max_khz == table[i].frequency)
max_ps = i;
}
rcu_read_unlock();
/* Only update when both are found and sane */
if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
return -EINVAL;
/* Guard simultaneous updates and make them atomic */
mutex_lock(&em_pd_mutex);
pd->min_perf_state = min_ps;
pd->max_perf_state = max_ps;
mutex_unlock(&em_pd_mutex);
return 0;
}
EXPORT_SYMBOL_GPL(em_update_performance_limits);