mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-07 13:53:24 +00:00
Urgent RCU pull request for v5.19
This pull request contains a pair of commits that fix 282d8998e9
("srcu: Prevent expedited GPs and blocking readers from consuming CPU"), which was itself a fix to an SRCU expedited grace-period problem that could prevent kernel live patching (KLP) from completing. That SRCU fix for KLP introduced large (as in minutes) boot-time delays to embedded Linux kernels running on qemu/KVM. These delays were due to the emulation of certain MMIO operations controlling memory layout, which were emulated with one expedited grace period per access. Common configurations required thousands of boot-time MMIO accesses, and thus thousands of boot-time expedited SRCU grace periods. In these configurations, the occasional sleeps that allowed KLP to proceed caused excessive boot delays. These commits preserve enough sleeps to permit KLP to proceed, but few enough that the virtual embedded kernels still boot reasonably quickly. This represents a regression introduced in the v5.19 merge window, and the bug is causing significant inconvenience, hence this pull request. -----BEGIN PGP SIGNATURE----- iQJHBAABCgAxFiEEbK7UrM+RBIrCoViJnr8S83LZ+4wFAmLZ6LoTHHBhdWxtY2tA a2VybmVsLm9yZwAKCRCevxLzctn7jNHgD/4tb8Un6vZlrEaYbyA/ztUITX/2DisS kiqbQz1BH8V3B3PxSo4ldEiw+z3fC3SMyIPymuu9bhwm6SFdjEsarFkIqySxkYnX jnuk0JbWxs4Kk64rIkHHzAxzvM2Iw1EjSzjP1M+DC7iymSJpsgp+0zFJJtcJ8Y87 67hbQRQYk+1T7ZT+vq77NiyAAFEzSd8UydgBVxlsOOdkXQ91NYTyB8D6ldUJAnLU opwCEpgpu74Sp4Te5q6f9uAt8xZmXsyrm8zJgzTz0KSgivcpt4GmIoyEFYUQczj0 Hewr6+qM9AWfvfQxNvRCS25yeox18kbdp1qdp9rl0BZMtYN2Zsk1Ec4c79s7NBLc G3TIvJkGLHuZO1dO4BhLkYczgRYlaPxOR/0GKNn4m69/TbVmseUL1WeZS0pswB0q cH1AKKEg9KdPoaX0hTLoOrlv/vwbgjhKKuoqEv7yEUhJJdACy50rmnhWhSxeuQDb aIITVKkjkwpDtRX5QTdG1f5uIMoGz9BbUDv7VeodB0mrYHluXEfyNTwlqcISKAgm T9kLmsdfvMrQ4fLR5S3i3dwnL3b52OB8h5NyfW3YRkXEnA7//ef/XpPiW2HY8BMT 7QwPqOoUSr/IraAcI8j0QxRpioUk1oaNi+UJ3FSHni8re6rZ0kaxatRCT20h6Djq C9RVLaevw3bGXQ== =ndhB -----END PGP SIGNATURE----- Merge tag 'rcu-urgent.2022.07.21a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu Pull RCU fix from Paul McKenney: "This 
contains a pair of commits that fix 282d8998e9
("srcu: Prevent expedited GPs and blocking readers from consuming CPU"), which was itself a fix to an SRCU expedited grace-period problem that could prevent kernel live patching (KLP) from completing. That SRCU fix for KLP introduced large (as in minutes) boot-time delays to embedded Linux kernels running on qemu/KVM. These delays were due to the emulation of certain MMIO operations controlling memory layout, which were emulated with one expedited grace period per access. Common configurations required thousands of boot-time MMIO accesses, and thus thousands of boot-time expedited SRCU grace periods. In these configurations, the occasional sleeps that allowed KLP to proceed caused excessive boot delays. These commits preserve enough sleeps to permit KLP to proceed, but few enough that the virtual embedded kernels still boot reasonably quickly. This represents a regression introduced in the v5.19 merge window, and the bug is causing significant inconvenience" * tag 'rcu-urgent.2022.07.21a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu: srcu: Make expedited RCU grace periods block even less frequently srcu: Block less aggressively for expedited grace periods
This commit is contained in:
commit
4ba1329cbb
@ -5796,6 +5796,24 @@
|
||||
expediting. Set to zero to disable automatic
|
||||
expediting.
|
||||
|
||||
srcutree.srcu_max_nodelay [KNL]
|
||||
Specifies the number of no-delay instances
|
||||
per jiffy for which the SRCU grace period
|
||||
worker thread will be rescheduled with zero
|
||||
delay. Beyond this limit, the worker thread will
|
||||
be rescheduled with a sleep delay of one jiffy.
|
||||
|
||||
srcutree.srcu_max_nodelay_phase [KNL]
|
||||
Specifies the per-grace-period phase, number of
|
||||
non-sleeping polls of readers. Beyond this limit,
|
||||
the grace-period worker thread will be rescheduled
|
||||
with a sleep delay of one jiffy, between each
|
||||
rescan of the readers, for a grace period phase.
|
||||
|
||||
srcutree.srcu_retry_check_delay [KNL]
|
||||
Specifies the number of microseconds of non-sleeping
|
||||
delay between each non-sleeping poll of readers.
|
||||
|
||||
srcutree.small_contention_lim [KNL]
|
||||
Specifies the number of update-side contention
|
||||
events per jiffy that will be tolerated before
|
||||
|
@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
|
||||
return sum;
|
||||
}
|
||||
|
||||
#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
|
||||
#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
|
||||
#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances.
|
||||
#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances.
|
||||
/*
|
||||
* We use an adaptive strategy for synchronize_srcu() and especially for
|
||||
* synchronize_srcu_expedited(). We spin for a fixed time period
|
||||
* (defined below, boot time configurable) to allow SRCU readers to exit
|
||||
* their read-side critical sections. If there are still some readers
|
||||
* after one jiffy, we repeatedly block for one jiffy time periods.
|
||||
* The blocking time is increased as the grace-period age increases,
|
||||
* with max blocking time capped at 10 jiffies.
|
||||
*/
|
||||
#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
|
||||
|
||||
static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
|
||||
module_param(srcu_retry_check_delay, ulong, 0444);
|
||||
|
||||
#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
|
||||
#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
|
||||
|
||||
#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
|
||||
// no-delay instances.
|
||||
#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
|
||||
// no-delay instances.
|
||||
|
||||
#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
|
||||
#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
|
||||
#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
|
||||
// Per-GP-phase no-delay instances adjusted to allow non-sleeping poll up to
|
||||
// one jiffy's time duration. Mult by 2 is done to factor in the srcu_get_delay()
|
||||
// called from process_srcu().
|
||||
#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
|
||||
(2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
|
||||
|
||||
// Maximum per-GP-phase consecutive no-delay instances.
|
||||
#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
|
||||
SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
|
||||
SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
|
||||
SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
|
||||
|
||||
static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
|
||||
module_param(srcu_max_nodelay_phase, ulong, 0444);
|
||||
|
||||
// Maximum consecutive no-delay instances.
|
||||
#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
|
||||
SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
|
||||
|
||||
static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
|
||||
module_param(srcu_max_nodelay, ulong, 0444);
|
||||
|
||||
/*
|
||||
* Return grace-period delay, zero if there are expedited grace
|
||||
@ -522,16 +564,22 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
|
||||
*/
|
||||
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
|
||||
{
|
||||
unsigned long gpstart;
|
||||
unsigned long j;
|
||||
unsigned long jbase = SRCU_INTERVAL;
|
||||
|
||||
if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
|
||||
jbase = 0;
|
||||
if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)))
|
||||
jbase += jiffies - READ_ONCE(ssp->srcu_gp_start);
|
||||
if (!jbase) {
|
||||
WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
|
||||
if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
|
||||
jbase = 1;
|
||||
if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
|
||||
j = jiffies - 1;
|
||||
gpstart = READ_ONCE(ssp->srcu_gp_start);
|
||||
if (time_after(j, gpstart))
|
||||
jbase += j - gpstart;
|
||||
if (!jbase) {
|
||||
WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
|
||||
if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
|
||||
jbase = 1;
|
||||
}
|
||||
}
|
||||
return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
|
||||
}
|
||||
@ -606,15 +654,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
|
||||
|
||||
/*
|
||||
* We use an adaptive strategy for synchronize_srcu() and especially for
|
||||
* synchronize_srcu_expedited(). We spin for a fixed time period
|
||||
* (defined below) to allow SRCU readers to exit their read-side critical
|
||||
* sections. If there are still some readers after a few microseconds,
|
||||
* we repeatedly block for 1-millisecond time periods.
|
||||
*/
|
||||
#define SRCU_RETRY_CHECK_DELAY 5
|
||||
|
||||
/*
|
||||
* Start an SRCU grace period.
|
||||
*/
|
||||
@ -700,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
|
||||
*/
|
||||
static void srcu_gp_end(struct srcu_struct *ssp)
|
||||
{
|
||||
unsigned long cbdelay;
|
||||
unsigned long cbdelay = 1;
|
||||
bool cbs;
|
||||
bool last_lvl;
|
||||
int cpu;
|
||||
@ -720,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
|
||||
spin_lock_irq_rcu_node(ssp);
|
||||
idx = rcu_seq_state(ssp->srcu_gp_seq);
|
||||
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
|
||||
cbdelay = !!srcu_get_delay(ssp);
|
||||
if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
|
||||
cbdelay = 0;
|
||||
|
||||
WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
|
||||
rcu_seq_end(&ssp->srcu_gp_seq);
|
||||
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
|
||||
@ -921,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
|
||||
*/
|
||||
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
|
||||
{
|
||||
unsigned long curdelay;
|
||||
|
||||
curdelay = !srcu_get_delay(ssp);
|
||||
|
||||
for (;;) {
|
||||
if (srcu_readers_active_idx_check(ssp, idx))
|
||||
return true;
|
||||
if (--trycount + !srcu_get_delay(ssp) <= 0)
|
||||
if ((--trycount + curdelay) <= 0)
|
||||
return false;
|
||||
udelay(SRCU_RETRY_CHECK_DELAY);
|
||||
udelay(srcu_retry_check_delay);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1582,7 +1627,7 @@ static void process_srcu(struct work_struct *work)
|
||||
j = jiffies;
|
||||
if (READ_ONCE(ssp->reschedule_jiffies) == j) {
|
||||
WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
|
||||
if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
|
||||
if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
|
||||
curdelay = 1;
|
||||
} else {
|
||||
WRITE_ONCE(ssp->reschedule_count, 1);
|
||||
@ -1674,6 +1719,11 @@ static int __init srcu_bootup_announce(void)
|
||||
pr_info("Hierarchical SRCU implementation.\n");
|
||||
if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
|
||||
pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
|
||||
if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
|
||||
pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
|
||||
if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
|
||||
pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
|
||||
pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
|
||||
return 0;
|
||||
}
|
||||
early_initcall(srcu_bootup_announce);
|
||||
|
Loading…
Reference in New Issue
Block a user