From 7aeba709a048d870c15940af8b620b16281c3b9e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:42 +0200 Subject: [PATCH 01/64] rcu/nocb: Introduce RCU_NOCB_LOCKDEP_WARN() Checking for races against concurrent (de-)offloading implies the creation of !CONFIG_RCU_NOCB_CPU stubs to check if each relevant lock is held. For now this only implies the nocb_lock but more are to be expected. Create instead a NOCB specific version of RCU_LOCKDEP_WARN() to avoid the proliferation of stubs. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 7 +++++++ kernel/rcu/tree_nocb.h | 14 -------------- kernel/rcu/tree_plugin.h | 4 ++-- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13f6f00aecf9..d48d3c237305 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -144,11 +144,18 @@ void rcu_init_nohz(void); int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); void rcu_nocb_flush_deferred_wakeup(void); + +#define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s) + #else /* #ifdef CONFIG_RCU_NOCB_CPU */ + static inline void rcu_init_nohz(void) { } static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } static inline void rcu_nocb_flush_deferred_wakeup(void) { } + +#define RCU_NOCB_LOCKDEP_WARN(c, s) + #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3ce30841119a..f4112fc663a7 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -16,10 +16,6 @@ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return lockdep_is_held(&rdp->nocb_lock); -} static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) { @@ -1653,16 +1649,6 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return 0; -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - return false; -} - /* No ->nocb_lock to acquire. */ static void rcu_nocb_lock(struct rcu_data *rdp) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c569da65b421..f752b2a1d887 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -24,10 +24,10 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) * timers have their own means of synchronization against the * offloaded state updaters. */ - RCU_LOCKDEP_WARN( + RCU_NOCB_LOCKDEP_WARN( !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || - rcu_lockdep_is_held_nocb(rdp) || + lockdep_is_held(&rdp->nocb_lock) || (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), From ff81428ede8a290fc5fd85135e39be1065ae6176 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:43 +0200 Subject: [PATCH 02/64] rcu/nocb: Move nocb field at the end of state struct nocb_is_setup is a rarely used field, mostly on boot and CPU hotplug. It shouldn't occupy the middle of the rcu state hot fields cacheline. 
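As a rough illustration of that layout rule (a sketch only; the structure, field, and Kconfig names below are invented for the example and are not taken from this patch), frequently accessed fields stay grouped at the front of the structure while rarely written, boot-time-only state moves to the end and is compiled in only when the feature is configured:

	/* Sketch: hot fields first, cold boot-time-only state last and conditional. */
	struct example_state {
		raw_spinlock_t	lock;		/* hot: taken on every update */
		unsigned long	gp_seq;		/* hot: read on every grace period */
		/* ... other frequently accessed fields ... */

	#ifdef CONFIG_EXAMPLE_FEATURE
		int	feature_is_setup;	/* cold: written once during boot */
	#endif
	};

Keeping the cold field behind the #ifdef also means kernels built without the feature pay no size or cache-footprint cost for it.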
Move it to the end and build it conditionally while at it. More cold NOCB fields are to come. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fcf2b4aa3441..a297dc89a09c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -411,7 +411,6 @@ struct rcu_state { arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ - int nocb_is_setup; /* nocb is setup from boot */ /* synchronize_rcu() part. */ struct llist_head srs_next; /* request a GP users. */ @@ -420,6 +419,10 @@ struct rcu_state { struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; struct work_struct srs_cleanup_work; atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ + +#ifdef CONFIG_RCU_NOCB_CPU + int nocb_is_setup; /* nocb is setup from boot */ +#endif }; /* Values for rcu_state structure's gp_flags field. */ From 7be88a857eb84d2e0677690b81ee423dee51c93d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:44 +0200 Subject: [PATCH 03/64] rcu/nocb: Assert no callbacks while nocb kthread allocation fails When a NOCB CPU fails to create a nocb kthread on bringup, the CPU is then deoffloaded. The barrier mutex is locked at this stage. It is typically used to protect against concurrent (de-)offloading and/or concurrent rcu_barrier() that would otherwise risk a nocb locking imbalance. However: * rcu_barrier() can't run concurrently if it's the boot CPU on early boot-up. * rcu_barrier() can run concurrently if it's a secondary CPU but it is expected to see 0 callbacks on this target because it's the first time it boots. * (de-)offloading can't happen concurrently with smp_init(), as rcutorture is initialized later, at least not before device_initcall(), and userspace isn't available yet. * (de-)offloading can't happen concurrently with cpu_up(), courtesy of cpu_hotplug_lock. But: * The lazy shrinker might run concurrently with cpu_up(). It shouldn't try to grab the nocb_lock and risk an imbalance due to lazy_len supposed to be 0 but be extra cautious. * Also be cautious against resume from hibernation potential subtleties. So keep the locking and add some assertions and comments. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index f4112fc663a7..fdd0616f2fd1 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1442,7 +1442,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) "rcuog/%d", rdp_gp->cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - goto end; + goto err; } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) @@ -1454,7 +1454,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) t = kthread_create(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - goto end; + goto err; if (rcu_rdp_is_offloaded(rdp)) wake_up_process(t); @@ -1467,7 +1467,15 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); return; -end: + +err: + /* + * No need to protect against concurrent rcu_barrier() + * because the number of callbacks should be 0 for a non-boot CPU, + * therefore rcu_barrier() shouldn't even try to grab the nocb_lock. + * But hold barrier_mutex to avoid nocb_lock imbalance from shrinker. + */ + WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist)); mutex_lock(&rcu_state.barrier_mutex); if (rcu_rdp_is_offloaded(rdp)) { rcu_nocb_rdp_deoffload(rdp); From 7121dd915a262aee1f5a88cf86b1543f3a9439d7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:45 +0200 Subject: [PATCH 04/64] rcu/nocb: Introduce nocb mutex The barrier_mutex is used currently to protect (de-)offloading operations and prevent from nocb_lock locking imbalance in rcu_barrier() and shrinker, and also from misordered RCU barrier invocation. Now since RCU (de-)offloading is going to happen on offline CPUs, an RCU barrier will have to be executed while transitionning from offloaded to de-offloaded state. And this can't happen while holding the barrier_mutex. Introduce a NOCB mutex to protect (de-)offloading transitions. The barrier_mutex is still held for now when necessary to avoid barrier callbacks reordering and nocb_lock imbalance. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 3 +++ kernel/rcu/tree.h | 1 + kernel/rcu/tree_nocb.h | 20 ++++++++++++-------- kernel/rcu/tree_plugin.h | 1 + 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..2b9e713854b0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,6 +97,9 @@ static struct rcu_state rcu_state = { .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, rcu_sr_normal_gp_cleanup_work), .srs_cleanups_pending = ATOMIC_INIT(0), +#ifdef CONFIG_RCU_NOCB_CPU + .nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex), +#endif }; /* Dump rcu_node combining tree at boot to verify correct setup. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a297dc89a09c..16e6fe63d93c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -421,6 +421,7 @@ struct rcu_state { atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. 
*/ #ifdef CONFIG_RCU_NOCB_CPU + struct mutex nocb_mutex; /* Guards (de-)offloading */ int nocb_is_setup; /* nocb is setup from boot */ #endif }; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index fdd0616f2fd1..16bcb8b13a5e 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1141,6 +1141,7 @@ int rcu_nocb_cpu_deoffload(int cpu) int ret = 0; cpus_read_lock(); + mutex_lock(&rcu_state.nocb_mutex); mutex_lock(&rcu_state.barrier_mutex); if (rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { @@ -1153,6 +1154,7 @@ int rcu_nocb_cpu_deoffload(int cpu) } } mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); return ret; @@ -1228,6 +1230,7 @@ int rcu_nocb_cpu_offload(int cpu) int ret = 0; cpus_read_lock(); + mutex_lock(&rcu_state.nocb_mutex); mutex_lock(&rcu_state.barrier_mutex); if (!rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { @@ -1240,6 +1243,7 @@ int rcu_nocb_cpu_offload(int cpu) } } mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); return ret; @@ -1257,7 +1261,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) return 0; /* Protect rcu_nocb_mask against concurrent (de-)offloading. */ - if (!mutex_trylock(&rcu_state.barrier_mutex)) + if (!mutex_trylock(&rcu_state.nocb_mutex)) return 0; /* Snapshot count of all CPUs */ @@ -1267,7 +1271,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) count += READ_ONCE(rdp->lazy_len); } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); return count ? count : SHRINK_EMPTY; } @@ -1285,9 +1289,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * Protect against concurrent (de-)offloading. Otherwise nocb locking * may be ignored or imbalanced. */ - if (!mutex_trylock(&rcu_state.barrier_mutex)) { + if (!mutex_trylock(&rcu_state.nocb_mutex)) { /* - * But really don't insist if barrier_mutex is contended since we + * But really don't insist if nocb_mutex is contended since we * can't guarantee that it will never engage in a dependency * chain involving memory allocation. The lock is seldom contended * anyway. @@ -1326,7 +1330,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); return count ? count : SHRINK_STOP; } @@ -1473,15 +1477,15 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) * No need to protect against concurrent rcu_barrier() * because the number of callbacks should be 0 for a non-boot CPU, * therefore rcu_barrier() shouldn't even try to grab the nocb_lock. - * But hold barrier_mutex to avoid nocb_lock imbalance from shrinker. + * But hold nocb_mutex to avoid nocb_lock imbalance from shrinker. */ WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist)); - mutex_lock(&rcu_state.barrier_mutex); + mutex_lock(&rcu_state.nocb_mutex); if (rcu_rdp_is_offloaded(rdp)) { rcu_nocb_rdp_deoffload(rdp); cpumask_clear_cpu(cpu, rcu_nocb_mask); } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). 
*/ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f752b2a1d887..c662376c8af0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,6 +28,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || lockdep_is_held(&rdp->nocb_lock) || + lockdep_is_held(&rcu_state.nocb_mutex) || (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), From 4e26ce423116e9f48f73723cc33add295e56ad31 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:46 +0200 Subject: [PATCH 05/64] rcu/nocb: (De-)offload callbacks on offline CPUs only Currently callbacks can be (de-)offloaded only on online CPUs. This involves an overly elaborated state machine in order to make sure that callbacks are always handled during the process while ensuring synchronization between rcu_core and NOCB kthreads. The only potential user of NOCB (de-)offloading appears to be a nohz_full toggling interface through cpusets. And the general agreement is now to work toward toggling the nohz_full state on offline CPUs to simplify the whole picture. Therefore, convert the (de-)offloading to only support offline CPUs. This involves the following changes: * Call rcu_barrier() before deoffloading. An offline offloaded CPU may still carry callbacks in its queue ignored by rcutree_migrate_callbacks(). Those callbacks must all be flushed before switching to a regular queue because no more kthreads will handle those before the CPU ever gets re-onlined. This means that further calls to rcu_barrier() will find an empty queue until the CPU goes through rcutree_report_cpu_starting(). As a result it is guaranteed that further rcu_barrier() won't try to lock the nocb_lock for that target and thus won't risk an imbalance. Therefore barrier_mutex doesn't need to be locked anymore upon deoffloading. * Assume the queue is empty before offloading, as rcutree_migrate_callbacks() took care of everything. This means that further calls to rcu_barrier() will find an empty queue until the CPU goes through rcutree_report_cpu_starting(). As a result it is guaranteed that further rcu_barrier() won't risk a nocb_lock imbalance. Therefore barrier_mutex doesn't need to be locked anymore upon offloading. * No need to flush bypass anymore. Further simplifications will follow in upcoming patches. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 82 +++++++++++------------------------------- 1 file changed, 21 insertions(+), 61 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 16bcb8b13a5e..8e766396df3a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1049,43 +1049,26 @@ static int rdp_offload_toggle(struct rcu_data *rdp, return wake_gp; } -static long rcu_nocb_rdp_deoffload(void *arg) +static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) { - struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; int wake_gp; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - /* - * rcu_nocb_rdp_deoffload() may be called directly if - * rcuog/o[p] spawn failed, because at this time the rdp->cpu - * is not online yet. 
- */ - WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu)); + /* CPU must be offline, unless it's early boot */ + WARN_ON_ONCE(cpu_online(rdp->cpu) && rdp->cpu != raw_smp_processor_id()); pr_info("De-offloading %d\n", rdp->cpu); + /* Flush all callbacks from segcblist and bypass */ + rcu_barrier(); + rcu_nocb_lock_irqsave(rdp, flags); - /* - * Flush once and for all now. This suffices because we are - * running on the target CPU holding ->nocb_lock (thus having - * interrupts disabled), and because rdp_offload_toggle() - * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. - * Thus future calls to rcu_segcblist_completely_offloaded() will - * return false, which means that future calls to rcu_nocb_try_bypass() - * will refuse to put anything into the bypass. - */ - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); - /* - * Start with invoking rcu_core() early. This way if the current thread - * happens to preempt an ongoing call to rcu_core() in the middle, - * leaving some work dismissed because rcu_core() still thinks the rdp is - * completely offloaded, we are guaranteed a nearby future instance of - * rcu_core() to catch up. - */ + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); - invoke_rcu_core(); wake_gp = rdp_offload_toggle(rdp, false, flags); mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); @@ -1128,10 +1111,6 @@ static long rcu_nocb_rdp_deoffload(void *arg) */ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - /* Sanity check */ - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - - return 0; } @@ -1142,18 +1121,16 @@ int rcu_nocb_cpu_deoffload(int cpu) cpus_read_lock(); mutex_lock(&rcu_state.nocb_mutex); - mutex_lock(&rcu_state.barrier_mutex); if (rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_deoffload(rdp); if (!ret) cpumask_clear_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu); + pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu); ret = -EINVAL; } } - mutex_unlock(&rcu_state.barrier_mutex); mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); @@ -1161,15 +1138,14 @@ int rcu_nocb_cpu_deoffload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); -static long rcu_nocb_rdp_offload(void *arg) +static int rcu_nocb_rdp_offload(struct rcu_data *rdp) { - struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; int wake_gp; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + WARN_ON_ONCE(cpu_online(rdp->cpu)); /* * For now we only support re-offload, ie: the rdp must have been * offloaded on boot first. @@ -1182,28 +1158,15 @@ static long rcu_nocb_rdp_offload(void *arg) pr_info("Offloading %d\n", rdp->cpu); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + /* * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING * is set. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - /* - * We didn't take the nocb lock while working on the - * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode). - * Every modifications that have been done previously on - * rdp->cblist must be visible remotely by the nocb kthreads - * upon wake up after reading the cblist flags. 
- * - * The layout against nocb_lock enforces that ordering: - * - * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() - * ------------------------- ---------------------------- - * WRITE callbacks rcu_nocb_lock() - * rcu_nocb_lock() READ flags - * WRITE flags READ callbacks - * rcu_nocb_unlock() rcu_nocb_unlock() - */ wake_gp = rdp_offload_toggle(rdp, true, flags); if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); @@ -1214,8 +1177,7 @@ static long rcu_nocb_rdp_offload(void *arg) rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); /* - * All kthreads are ready to work, we can finally relieve rcu_core() and - * enable nocb bypass. + * All kthreads are ready to work, we can finally enable nocb bypass. */ rcu_nocb_lock_irqsave(rdp, flags); rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); @@ -1231,18 +1193,16 @@ int rcu_nocb_cpu_offload(int cpu) cpus_read_lock(); mutex_lock(&rcu_state.nocb_mutex); - mutex_lock(&rcu_state.barrier_mutex); if (!rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_offload(rdp); if (!ret) cpumask_set_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu); + pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu); ret = -EINVAL; } } - mutex_unlock(&rcu_state.barrier_mutex); mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); From d2e7f55ff2e3d878a63149c90b360cbec101a83e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:47 +0200 Subject: [PATCH 06/64] rcu/nocb: Remove halfway (de-)offloading handling from bypass Bypass enqueue can't happen anymore in the middle of (de-)offloading since this sort of transition now only applies to offline CPUs. The related safety check can therefore be removed. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 8e766396df3a..af44e75eb0cd 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -409,14 +409,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, return false; } - // In the process of (de-)offloading: no bypassing, but - // locking. - if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; /* Not offloaded, no bypassing. */ - } - // Don't use ->nocb_bypass during early boot. if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { rcu_nocb_lock(rdp); From 5a4f9059a8f46fc1fbbae74781a5f7c8d74c732b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:48 +0200 Subject: [PATCH 07/64] rcu/nocb: Remove halfway (de-)offloading handling from rcu_core()'s QS reporting RCU core can't be running anymore while in the middle of (de-)offloading since this sort of transition now only applies to offline CPUs. The locked callback acceleration handling during the transition can therefore be removed. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2b9e713854b0..60f271f5c079 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2386,7 +2386,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needacc = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2423,23 +2422,11 @@ rcu_report_qs_rdp(struct rcu_data *rdp) * to return true. So complain, but don't awaken. */ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp)); - } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - /* - * ...but NOCB kthreads may miss or delay callbacks acceleration - * if in the middle of a (de-)offloading process. - */ - needacc = true; } rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ - - if (needacc) { - rcu_nocb_lock_irqsave(rdp, flags); - rcu_accelerate_cbs_unlocked(rnp, rdp); - rcu_nocb_unlock_irqrestore(rdp, flags); - } } } From df7c249a0ed464b3f690c6f04b8cbc0fdbf30afc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:49 +0200 Subject: [PATCH 08/64] rcu/nocb: Remove halfway (de-)offloading handling from rcu_core RCU core can't be running anymore while in the middle of (de-)offloading since this sort of transition now only applies to offline CPUs. The locked callback acceleration handling during the transition can therefore be removed, along with concurrent batch execution. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 60f271f5c079..1a272c678533 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2781,24 +2781,6 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - /* - * On RT rcu_core() can be preempted when IRQs aren't disabled. - * Therefore this function can race with concurrent NOCB (de-)offloading - * on this CPU and the below condition must be considered volatile. - * However if we race with: - * - * _ Offloading: In the worst case we accelerate or process callbacks - * concurrently with NOCB kthreads. We are guaranteed to - * call rcu_nocb_lock() if that happens. - * - * _ Deoffloading: In the worst case we miss callbacks acceleration or - * processing. This is fine because the early stage - * of deoffloading invokes rcu_core() after setting - * SEGCBLIST_RCU_CORE. So we guarantee that we'll process - * what could have been dismissed without the need to wait - * for the next rcu_pending() check in the next jiffy. - */ - const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2818,17 +2800,17 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? 
*/ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) { - rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { + local_irq_save(flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - rcu_nocb_unlock_irqrestore(rdp, flags); + local_irq_restore(flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && + if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) { rcu_do_batch(rdp); /* Re-invoke RCU core processing if there are callbacks remaining. */ From bae6076ebbd14cc1f1bd4de65c2db21d3ab109d7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:50 +0200 Subject: [PATCH 09/64] rcu/nocb: Remove SEGCBLIST_RCU_CORE RCU core can't be running anymore while in the middle of (de-)offloading since this sort of transition now only applies to offline CPUs. The SEGCBLIST_RCU_CORE state can therefore be removed. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcu_segcblist.h | 9 ++++----- kernel/rcu/rcu_segcblist.h | 9 --------- kernel/rcu/tree.c | 3 --- kernel/rcu/tree_nocb.h | 9 --------- 4 files changed, 4 insertions(+), 26 deletions(-) diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index ba95c06675e1..5469c54cd778 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -185,11 +185,10 @@ struct rcu_cblist { * ---------------------------------------------------------------------------- */ #define SEGCBLIST_ENABLED BIT(0) -#define SEGCBLIST_RCU_CORE BIT(1) -#define SEGCBLIST_LOCKING BIT(2) -#define SEGCBLIST_KTHREAD_CB BIT(3) -#define SEGCBLIST_KTHREAD_GP BIT(4) -#define SEGCBLIST_OFFLOADED BIT(5) +#define SEGCBLIST_LOCKING BIT(1) +#define SEGCBLIST_KTHREAD_CB BIT(2) +#define SEGCBLIST_KTHREAD_GP BIT(3) +#define SEGCBLIST_OFFLOADED BIT(4) struct rcu_segcblist { struct rcu_head *head; diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 4fe877f5f654..7a0962dfee86 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -95,15 +95,6 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) return false; } -static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) -{ - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE)) - return true; - - return false; -} - /* * Are all segments following the specified segment of the specified * rcu_segcblist structure empty of callbacks? 
(The specified diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1a272c678533..82e831b969e4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -79,9 +79,6 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .gpwrap = true, -#ifdef CONFIG_RCU_NOCB_CPU - .cblist.flags = SEGCBLIST_RCU_CORE, -#endif }; static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index af44e75eb0cd..24daf606de0c 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1060,7 +1060,6 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); - rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); wake_gp = rdp_offload_toggle(rdp, false, flags); mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); @@ -1168,13 +1167,6 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) swait_event_exclusive(rdp->nocb_state_wq, rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - /* - * All kthreads are ready to work, we can finally enable nocb bypass. - */ - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); - rcu_nocb_unlock_irqrestore(rdp, flags); - return 0; } @@ -1350,7 +1342,6 @@ void __init rcu_init_nohz(void) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); } rcu_organize_nocb_kthreads(); } From 91e43b9044a4f9b6976dc28b3f1446b733cc68de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:51 +0200 Subject: [PATCH 10/64] rcu/nocb: Remove SEGCBLIST_KTHREAD_CB This state excerpt from the (de-)offloading state machine was used to implement an ad-hoc kthread parking of rcuo kthreads. This code has been removed and therefore the related state can be erased as well. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcu_segcblist.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 5469c54cd778..1ef1bb54853d 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -186,9 +186,8 @@ struct rcu_cblist { */ #define SEGCBLIST_ENABLED BIT(0) #define SEGCBLIST_LOCKING BIT(1) -#define SEGCBLIST_KTHREAD_CB BIT(2) -#define SEGCBLIST_KTHREAD_GP BIT(3) -#define SEGCBLIST_OFFLOADED BIT(4) +#define SEGCBLIST_KTHREAD_GP BIT(2) +#define SEGCBLIST_OFFLOADED BIT(3) struct rcu_segcblist { struct rcu_head *head; From cf3b1501a8aa8e817baede5e3e1f2beb26a09527 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Jun 2024 13:35:47 -0700 Subject: [PATCH 11/64] rcutorture: Remove redundant rcu_torture_ops get_gp_completed fields The rcu_torture_ops structure's ->get_gp_completed and ->get_gp_completed_full fields are redundant with its ->get_comp_state and ->get_comp_state_full fields. This commit therefore removes the former in favor of the latter. Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 08bf7c669dd3..836e27250a8b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -366,8 +366,6 @@ struct rcu_torture_ops { bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2); unsigned long (*get_gp_state)(void); void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp); - unsigned long (*get_gp_completed)(void); - void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*start_gp_poll)(void); void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp); bool (*poll_gp_state)(unsigned long oldstate); @@ -553,8 +551,6 @@ static struct rcu_torture_ops rcu_ops = { .get_comp_state_full = get_completed_synchronize_rcu_full, .get_gp_state = get_state_synchronize_rcu, .get_gp_state_full = get_state_synchronize_rcu_full, - .get_gp_completed = get_completed_synchronize_rcu, - .get_gp_completed_full = get_completed_synchronize_rcu_full, .start_gp_poll = start_poll_synchronize_rcu, .start_gp_poll_full = start_poll_synchronize_rcu_full, .poll_gp_state = poll_state_synchronize_rcu, @@ -1437,8 +1433,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); - if (cur_ops->get_gp_completed) { - cookie = cur_ops->get_gp_completed(); + if (cur_ops->get_comp_state) { + cookie = cur_ops->get_comp_state(); WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); } cur_ops->readunlock(idx); @@ -1452,8 +1448,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cpumask_pr_args(cpu_online_mask)); - if (cur_ops->get_gp_completed_full) { - cur_ops->get_gp_completed_full(&cookie_full); + if (cur_ops->get_comp_state_full) { + cur_ops->get_comp_state_full(&cookie_full); WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full)); } cur_ops->readunlock(idx); From 1bc5bb9a61378bbb2ca73ab06651d5b174876a8e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Jun 2024 16:47:12 -0700 Subject: [PATCH 12/64] rcutorture: Add SRCU ->same_gp_state and ->get_comp_state functions This commit points the SRCU ->same_gp_state and ->get_comp_state fields to same_state_synchronize_srcu() and get_completed_synchronize_srcu(), allowing them to be tested. Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 836e27250a8b..b2e6201b4569 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -736,6 +736,8 @@ static struct rcu_torture_ops srcu_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, @@ -776,6 +778,8 @@ static struct rcu_torture_ops srcud_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, From 72ed29f68e6370841ba10edce5b9a213259eb9a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Jun 2024 21:34:17 -0700 Subject: [PATCH 13/64] rcutorture: Generic test for NUM_ACTIVE_*RCU_POLL* The rcutorture test suite has specific tests for both of the NUM_ACTIVE_RCU_POLL_OLDSTATE and NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE macros provided for RCU polled grace periods. However, with the advent of NUM_ACTIVE_SRCU_POLL_OLDSTATE, a more generic test is needed. This commit therefore adds ->poll_active and ->poll_active_full fields to the rcu_torture_ops structure and converts the existing specific tests to use these fields, when present. Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b2e6201b4569..acf9f9945d2b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -373,6 +373,8 @@ struct rcu_torture_ops { bool (*poll_need_2gp)(bool poll, bool poll_full); void (*cond_sync)(unsigned long oldstate); void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp); + int poll_active; + int poll_active_full; call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); @@ -558,6 +560,8 @@ static struct rcu_torture_ops rcu_ops = { .poll_need_2gp = rcu_poll_need_2gp, .cond_sync = cond_synchronize_rcu, .cond_sync_full = cond_synchronize_rcu_full, + .poll_active = NUM_ACTIVE_RCU_POLL_OLDSTATE, + .poll_active_full = NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE, .get_gp_state_exp = get_state_synchronize_rcu, .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, @@ -741,6 +745,7 @@ static struct rcu_torture_ops srcu_ops = { .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -783,6 +788,7 @@ static struct rcu_torture_ops srcud_ops = { .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -1374,13 +1380,15 @@ rcu_torture_writer(void *arg) int i; int idx; int oldnice = task_nice(current); - struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE]; + struct rcu_gp_oldstate *rgo = NULL; + int rgo_size = 0; struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); unsigned long stallsdone = jiffies; bool stutter_waited; - unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + unsigned long *ulo = NULL; + int ulo_size = 0; // If a new stall test is added, this must be adjusted. 
if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) @@ -1401,6 +1409,16 @@ rcu_torture_writer(void *arg) torture_kthread_stopping("rcu_torture_writer"); return 0; } + if (cur_ops->poll_active > 0) { + ulo = kzalloc(cur_ops->poll_active * sizeof(ulo[0]), GFP_KERNEL); + if (!WARN_ON(!ulo)) + ulo_size = cur_ops->poll_active; + } + if (cur_ops->poll_active_full > 0) { + rgo = kzalloc(cur_ops->poll_active_full * sizeof(rgo[0]), GFP_KERNEL); + if (!WARN_ON(!rgo)) + rgo_size = cur_ops->poll_active_full; + } do { rcu_torture_writer_state = RTWS_FIXED_DELAY; @@ -1502,19 +1520,19 @@ rcu_torture_writer(void *arg) break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; - for (i = 0; i < ARRAY_SIZE(ulo); i++) + for (i = 0; i < ulo_size; i++) ulo[i] = cur_ops->get_comp_state(); gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; while (!cur_ops->poll_gp_state(gp_snap)) { gp_snap1 = cur_ops->get_gp_state(); - for (i = 0; i < ARRAY_SIZE(ulo); i++) + for (i = 0; i < ulo_size; i++) if (cur_ops->poll_gp_state(ulo[i]) || cur_ops->same_gp_state(ulo[i], gp_snap1)) { ulo[i] = gp_snap1; break; } - WARN_ON_ONCE(i >= ARRAY_SIZE(ulo)); + WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size); torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } @@ -1522,20 +1540,20 @@ rcu_torture_writer(void *arg) break; case RTWS_POLL_GET_FULL: rcu_torture_writer_state = RTWS_POLL_GET_FULL; - for (i = 0; i < ARRAY_SIZE(rgo); i++) + for (i = 0; i < rgo_size; i++) cur_ops->get_comp_state_full(&rgo[i]); cur_ops->start_gp_poll_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { cur_ops->get_gp_state_full(&gp_snap1_full); - for (i = 0; i < ARRAY_SIZE(rgo); i++) + for (i = 0; i < rgo_size; i++) if (cur_ops->poll_gp_state_full(&rgo[i]) || cur_ops->same_gp_state_full(&rgo[i], &gp_snap1_full)) { rgo[i] = gp_snap1_full; break; } - WARN_ON_ONCE(i >= ARRAY_SIZE(rgo)); + WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size); torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } @@ -1617,6 +1635,8 @@ rcu_torture_writer(void *arg) pr_alert("%s" TORTURE_FLAG " Dynamic grace-period expediting was disabled.\n", torture_type); + kfree(ulo); + kfree(rgo); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; From dc86460e77e8cea4896ff19de905001922653311 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 19 Jun 2024 23:06:58 +0000 Subject: [PATCH 14/64] rcutorture: Add CFcommon.arch for arch-specific Kconfig options Add CFcommon.arch for arch-specific Kconfig options. In accordance with [1], [2] and [3], move the x86-specific kernel option CONFIG_HYPERVISOR_GUEST to CFcommon.i686 and CFcommon.x86_64, and also move the x86/PowerPC CONFIG_KVM_GUEST Kconfig option to CFcommon.i686, CFcommon.x86_64, and CFcommon.ppc64le. The "arch" in CFcommon.arch is taken from the "uname -m" command. [1] https://lore.kernel.org/all/20240427005626.1365935-1-zhouzhouyi@gmail.com/ [2] https://lore.kernel.org/all/059d36ce-6453-42be-a31e-895abd35d590@paulmck-laptop/ [3] https://lore.kernel.org/all/ZnBkHosMDhsh4H8g@J2N7QTR9R3/ Tested in x86_64 and PPC VM of Open Source Lab of Oregon State University. Fixes: a6fda6dab93c ("rcutorture: Tweak kvm options") Suggested-by: Paul E. McKenney Suggested-by: Mark Rutland Signed-off-by: Zhouyi Zhou Acked-by: Mark Rutland Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/CFcommon | 2 -- tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le | 1 + tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 | 2 ++ 5 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index b33cd8753689..ad79784e552d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -68,6 +68,8 @@ config_override_param "--gdb options" KcList "$TORTURE_KCONFIG_GDB_ARG" config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG" config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG" config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" +config_override_param "$config_dir/CFcommon.$(uname -m)" KcList \ + "`cat $config_dir/CFcommon.$(uname -m) 2> /dev/null`" cp $T/KcList $resdir/ConfigFragment base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon index 0e92d85313aa..217597e84905 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon @@ -1,7 +1,5 @@ CONFIG_RCU_TORTURE_TEST=y CONFIG_PRINTK_TIME=y -CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y -CONFIG_KVM_GUEST=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 new file mode 100644 index 000000000000..d8b2f555686f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 @@ -0,0 +1,2 @@ +CONFIG_HYPERVISOR_GUEST=y +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le new file mode 100644 index 000000000000..133da04247ee --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le @@ -0,0 +1 @@ +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 new file mode 100644 index 000000000000..d8b2f555686f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 @@ -0,0 +1,2 @@ +CONFIG_HYPERVISOR_GUEST=y +CONFIG_KVM_GUEST=y From 20cee0b3daa315237edded31ddf72a4e948d5e1d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jul 2024 09:18:33 -0700 Subject: [PATCH 15/64] rcutorture: Make rcu_torture_write_types() print number of update types This commit follows the list of update types with their count, resulting in console output like this: rcu_torture_write_types: Testing conditional GPs. rcu_torture_write_types: Testing conditional expedited GPs. rcu_torture_write_types: Testing conditional full-state GPs. rcu_torture_write_types: Testing expedited GPs. 
rcu_torture_write_types: Testing asynchronous GPs. rcu_torture_write_types: Testing polling GPs. rcu_torture_write_types: Testing polling full-state GPs. rcu_torture_write_types: Testing polling expedited GPs. rcu_torture_write_types: Testing polling full-state expedited GPs. rcu_torture_write_types: Testing normal GPs. rcu_torture_write_types: Testing 10 update types This commit adds the final line giving the count. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index acf9f9945d2b..9a2ecb8b88ec 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1324,6 +1324,7 @@ static void rcu_torture_write_types(void) } else if (gp_sync && !cur_ops->sync) { pr_alert("%s: gp_sync without primitives.\n", __func__); } + pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes); } /* From 9a13a324f40fc3846cf7867daffaccd785beb10c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:51:02 -0700 Subject: [PATCH 16/64] tools/rcu: Remove RCU Tasks Rude asynchronous APIs from rcu-updaters.sh The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are no longer. This commit therefore removes them from the rcu-updaters.sh script. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- tools/rcu/rcu-updaters.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/rcu/rcu-updaters.sh b/tools/rcu/rcu-updaters.sh index 4ef1397927bb..8a5df3f22550 100755 --- a/tools/rcu/rcu-updaters.sh +++ b/tools/rcu/rcu-updaters.sh @@ -21,12 +21,10 @@ fi bpftrace -e 'kprobe:kvfree_call_rcu, kprobe:call_rcu, kprobe:call_rcu_tasks, - kprobe:call_rcu_tasks_rude, kprobe:call_rcu_tasks_trace, kprobe:call_srcu, kprobe:rcu_barrier, kprobe:rcu_barrier_tasks, - kprobe:rcu_barrier_tasks_rude, kprobe:rcu_barrier_tasks_trace, kprobe:srcu_barrier, kprobe:synchronize_rcu, From 3471e96bcf53731639587580fd02e5a297940b91 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Jun 2024 10:40:30 -0700 Subject: [PATCH 17/64] rcu/kfree: Warn on unexpected tail state Within the rcu_sr_normal_gp_cleanup_work() function, there is an acquire load from rcu_state.srs_done_tail, which is expected to be non-NULL. This commit adds a WARN_ON_ONCE() to check this expectation. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..0f41a81138dc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1649,7 +1649,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) * the done tail list manipulations are protected here. */ done = smp_load_acquire(&rcu_state.srs_done_tail); - if (!done) + if (WARN_ON_ONCE(!done)) return; WARN_ON_ONCE(!rcu_sr_is_wait_head(done)); From c1972c8dc987769eac8e5dede536d3cff489c6ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 09:49:49 -0700 Subject: [PATCH 18/64] locking/csd_lock: Print large numbers as negatives The CSD-lock-hold diagnostics from CONFIG_CSD_LOCK_WAIT_DEBUG are printed in nanoseconds as unsigned long longs, which is a bit obtuse for human readers when timing bugs result in negative CSD-lock hold times. Yes, there are some people to whom it is immediately obvious that 18446744073709551615 is really -1, but for the rest of us... 
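To make the readability gain concrete, here is a tiny standalone userspace sketch (not the kernel code itself) showing how the same 64-bit wrap-around value reads when printed unsigned versus cast to a signed type:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t ts0 = 1000, ts2 = 999;	/* end timestamp "before" the start */
		uint64_t ts_delta = ts2 - ts0;	/* wraps to a huge unsigned value */

		printf("unsigned: %llu ns\n", (unsigned long long)ts_delta);
		printf("signed:   %lld ns\n", (long long)ts_delta);
		return 0;
	}

The first line prints 18446744073709551615 ns and the second prints -1 ns for the same bit pattern.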
Therefore, print these numbers as signed long longs, making the negative hold times immediately apparent. Reported-by: Rik van Riel Signed-off-by: Paul E. McKenney Cc: Imran Khan Cc: Ingo Molnar Cc: Leonardo Bras Cc: "Peter Zijlstra (Intel)" Cc: Rik van Riel Reviewed-by: Rik van Riel Signed-off-by: Neeraj Upadhyay --- kernel/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index aaffecdad319..e87953729230 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -249,8 +249,8 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ /* How long since this CSD lock was stuck. */ ts_delta = ts2 - ts0; - pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", - firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta, + pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely From 9aed3b51fd6186582a95abd9fa67782982540749 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jul 2024 18:49:35 -0700 Subject: [PATCH 19/64] rcu: Better define "atomic" for list replacement The kernel-doc headers for list_replace_rcu() and hlist_replace_rcu() claim that the replacement is atomic, which it is, but only for readers. Avoid confusion by making it clear that the atomic nature of these functions applies only to readers, not to concurrent updaters. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rculist.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 3dc1e58865f7..14dfa6008467 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -191,7 +191,10 @@ static inline void hlist_del_init_rcu(struct hlist_node *n) * @old : the element to be replaced * @new : the new element to insert * - * The @old entry will be replaced with the @new entry atomically. + * The @old entry will be replaced with the @new entry atomically from + * the perspective of concurrent readers. It is the caller's responsibility + * to synchronize with concurrent updaters, if any. + * * Note: @old should not be empty. */ static inline void list_replace_rcu(struct list_head *old, @@ -519,7 +522,9 @@ static inline void hlist_del_rcu(struct hlist_node *n) * @old : the element to be replaced * @new : the new element to insert * - * The @old entry will be replaced with the @new entry atomically. + * The @old entry will be replaced with the @new entry atomically from + * the perspective of concurrent readers. It is the caller's responsibility + * to synchronize with concurrent updaters, if any. */ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) From cfdbfb94b3824b8da3a6b21252d24754d0162ab6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:37:48 -0700 Subject: [PATCH 20/64] rcutorture: Add rcutree.nohz_full_patience_delay to TREE07 This commit adds the rcutree.nohz_full_patience_delay=1000 kernel boot parameter to the TREE07 scenario, on the observation that "if it ain't tested, it don't work". Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot index 979edbf4c820..55ce305b2a3d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot @@ -2,3 +2,4 @@ nohz_full=2-9 rcutorture.stall_cpu=14 rcutorture.stall_cpu_holdoff=90 rcutorture.fwd_progress=0 +rcutree.nohz_full_patience_delay=1000 From fb579e6656a8d5e042e65062f68ca41321817237 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 7 Aug 2024 11:55:00 +0200 Subject: [PATCH 21/64] rcu: Annotate struct kvfree_rcu_bulk_data with __counted_by() Add the __counted_by compiler attribute to the flexible array member records to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Increment nr_records before adding a new pointer to the records array. Signed-off-by: Thorsten Blum Reviewed-by: "Gustavo A. R. Silva" Reviewed-by: "Uladzislau Rezki (Sony)" Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0f41a81138dc..d5bf824159da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3227,7 +3227,7 @@ struct kvfree_rcu_bulk_data { struct list_head list; struct rcu_gp_oldstate gp_snap; unsigned long nr_records; - void *records[]; + void *records[] __counted_by(nr_records); }; /* @@ -3767,7 +3767,8 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, } // Finally insert and update the GP for this page. - bnode->records[bnode->nr_records++] = ptr; + bnode->nr_records++; + bnode->records[bnode->nr_records - 1] = ptr; get_state_synchronize_rcu_full(&bnode->gp_snap); atomic_inc(&(*krcp)->bulk_count[idx]); From 0aac9daef6763e6efef398faff71f8c593651cce Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 23 Jul 2024 14:10:25 -0400 Subject: [PATCH 22/64] rcu: Use system_unbound_wq to avoid disturbing isolated CPUs It was discovered that isolated CPUs could sometimes be disturbed by kworkers processing kfree_rcu() works causing higher than expected latency. It is because the RCU core uses "system_wq" which doesn't have the WQ_UNBOUND flag to handle all its work items. Fix this violation of latency limits by using "system_unbound_wq" in the RCU core instead. This will ensure that those work items will not be run on CPUs marked as isolated. Beside the WQ_UNBOUND flag, the other major difference between system_wq and system_unbound_wq is their max_active count. The system_unbound_wq has a max_active of WQ_MAX_ACTIVE (512) while system_wq's max_active is WQ_DFL_ACTIVE (256) which is half of WQ_MAX_ACTIVE. 
Reported-by: Vratislav Bendel Closes: https://issues.redhat.com/browse/RHEL-50220 Signed-off-by: Waiman Long Reviewed-by: "Uladzislau Rezki (Sony)" Tested-by: Breno Leitao Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..494aa9513d0b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3539,10 +3539,10 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) if (delayed_work_pending(&krcp->monitor_work)) { delay_left = krcp->monitor_work.timer.expires - jiffies; if (delay < delay_left) - mod_delayed_work(system_wq, &krcp->monitor_work, delay); + mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); return; } - queue_delayed_work(system_wq, &krcp->monitor_work, delay); + queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); } static void @@ -3634,7 +3634,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // be that the work is in the pending state when // channels have been detached following by each // other. - queue_rcu_work(system_wq, &krwp->rcu_work); + queue_rcu_work(system_unbound_wq, &krwp->rcu_work); } } @@ -3704,7 +3704,7 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_wq, + queue_delayed_work(system_unbound_wq, &krcp->page_cache_work, msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); } else { From 29bc83e4d90546aa794a9584786086b141a6ba4d Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Mon, 15 Jul 2024 16:23:24 -0700 Subject: [PATCH 23/64] srcu: faster gp seq wrap-around Using a higher value for the initial gp sequence counters allows for wrapping to occur faster. It can help with surfacing any issues that may be happening as a result of the wrap around. Signed-off-by: JP Kobryn Tested-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 3 +++ include/linux/srcutree.h | 15 ++++++++++++++- kernel/rcu/rcu.h | 3 --- kernel/rcu/srcutree.c | 7 ++++--- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13f6f00aecf9..8d56db70d417 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -34,6 +34,9 @@ #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) +#define RCU_SEQ_CTR_SHIFT 2 +#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) + /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8f3f72480e78..ed57598394de 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -129,10 +129,23 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 +/* + * Values for initializing gp sequence fields. Higher values allow wrap arounds to + * occur earlier. + * The second value with state is useful in the case of static initialization of + * srcu_usage where srcu_gp_seq_needed is expected to have some state value in its + * lower bits (or else it will appear to be already initialized within + * the call check_init_srcu_struct()). 
+ */ +#define SRCU_GP_SEQ_INITIAL_VAL ((0UL - 100UL) << RCU_SEQ_CTR_SHIFT) +#define SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE (SRCU_GP_SEQ_INITIAL_VAL - 1) + #define __SRCU_USAGE_INIT(name) \ { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ - .srcu_gp_seq_needed = -1UL, \ + .srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL, \ + .srcu_gp_seq_needed = SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE, \ + .srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL, \ .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 38238e595a61..2bfed9855d67 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -54,9 +54,6 @@ * grace-period sequence number. */ -#define RCU_SEQ_CTR_SHIFT 2 -#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) - /* Low-order bit definition for polled grace-period APIs. */ #define RCU_GET_STATE_COMPLETED 0x1 diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b24db425f16d..6fd9c914ce64 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -247,7 +247,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); ssp->srcu_idx = 0; - ssp->srcu_sup->srcu_gp_seq = 0; + ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); @@ -258,7 +258,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); - ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; + ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) @@ -266,7 +266,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } ssp->srcu_sup->srcu_ssp = ssp; - smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ + smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, + SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */ return 0; err_free_sda: From c8c3ae83e0bbed6d56a00068c7d10ebf64db48d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:36:21 -0700 Subject: [PATCH 24/64] srcu: Check for concurrent updates of heuristics SRCU maintains the ->srcu_n_exp_nodelay and ->reschedule_count values to guide heuristics governing auto-expediting of normal SRCU grace periods and grace-period-state-machine delays. This commit adds KCSAN ASSERT_EXCLUSIVE_WRITER() calls to check for concurrent updates to these fields. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/srcutree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6fd9c914ce64..aaee09a6748c 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -629,6 +629,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) if (time_after(j, gpstart)) jbase += j - gpstart; if (!jbase) { + ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay); WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1); if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) jbase = 1; @@ -1819,6 +1820,7 @@ static void process_srcu(struct work_struct *work) } else { j = jiffies; if (READ_ONCE(sup->reschedule_jiffies) == j) { + ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count); WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1); if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay) curdelay = 1; From e53cef031bfac2369d249cfd726706c30a5a3351 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:36:22 -0700 Subject: [PATCH 25/64] srcu: Mark callbacks not currently participating in barrier operation SRCU keeps a count of the number of callbacks that the current srcu_barrier() is waiting on, but there is currently no easy way to work out which callback is stuck. One way to do this is to mark idle SRCU-barrier callbacks by making the ->next pointer point to the callback itself, and this commit does just that. Later commits will use this for debug output. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/srcutree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index aaee09a6748c..31706e3293bc 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -137,6 +137,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head; sdp->mynode = NULL; sdp->cpu = cpu; INIT_WORK(&sdp->work, srcu_invoke_callbacks); @@ -1562,6 +1563,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) struct srcu_data *sdp; struct srcu_struct *ssp; + rhp->next = rhp; // Mark the callback as having been invoked. sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); ssp = sdp->ssp; if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) From 58cb3210543591c1783a9a884adf27708f19d2a1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:37:49 -0700 Subject: [PATCH 26/64] rcutorture: Add a stall_cpu_repeat module parameter This commit adds an stall_cpu_repeat kernel, which is also the rcutorture.stall_cpu_repeat boot parameter, to test repeated CPU stalls. Note that only the first stall will pay attention to the stall_cpu_irqsoff module parameter. For the second and subsequent stalls, interrupts will be enabled. This is helpful when testing the interaction between RCU CPU stall warnings and CSD-lock stall warnings. Reported-by: Rik van Riel Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- .../admin-guide/kernel-parameters.txt | 8 ++- kernel/rcu/rcutorture.c | 56 +++++++++++++------ 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1384c7b59c9..db1be767c91a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5384,7 +5384,13 @@ Time to wait (s) after boot before inducing stall. rcutorture.stall_cpu_irqsoff= [KNL] - Disable interrupts while stalling if set. + Disable interrupts while stalling if set, but only + on the first stall in the set. + + rcutorture.stall_cpu_repeat= [KNL] + Number of times to repeat the stall sequence, + so that rcutorture.stall_cpu_repeat=3 will result + in four stall sequences. rcutorture.stall_gp_kthread= [KNL] Duration (s) of forced sleep within RCU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9a2ecb8b88ec..4fd23cae0e9e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -115,6 +115,7 @@ torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_cpu_repeat, 0, "Number of additional stalls after the first one."); torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -1393,7 +1394,8 @@ rcu_torture_writer(void *arg) // If a new stall test is added, this must be adjusted. if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) - stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ; + stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * + HZ * (stall_cpu_repeat + 1); VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) pr_alert("%s" TORTURE_FLAG @@ -2391,7 +2393,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " - "stall_cpu_block=%d " + "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " @@ -2403,7 +2405,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, - stall_cpu_block, + stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, @@ -2481,19 +2483,11 @@ static struct notifier_block rcu_torture_stall_block = { * induces a CPU stall for the time specified by stall_cpu. If a new * stall test is added, stallsdone in rcu_torture_writer() must be adjusted. 
*/ -static int rcu_torture_stall(void *args) +static void rcu_torture_stall_one(int rep, int irqsoff) { int idx; - int ret; unsigned long stop_at; - VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); - if (rcu_cpu_stall_notifiers) { - ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); - if (ret) - pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", - __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); - } if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); @@ -2513,12 +2507,12 @@ static int rcu_torture_stall(void *args) stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ idx = cur_ops->readlock(); - if (stall_cpu_irqsoff) + if (irqsoff) local_irq_disable(); else if (!stall_cpu_block) preempt_disable(); - pr_alert("%s start on CPU %d.\n", - __func__, raw_smp_processor_id()); + pr_alert("%s start stall episode %d on CPU %d.\n", + __func__, rep + 1, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) && !kthread_should_stop()) if (stall_cpu_block) { @@ -2530,12 +2524,42 @@ static int rcu_torture_stall(void *args) } else if (stall_no_softlockup) { touch_softlockup_watchdog(); } - if (stall_cpu_irqsoff) + if (irqsoff) local_irq_enable(); else if (!stall_cpu_block) preempt_enable(); cur_ops->readunlock(idx); } +} + +/* + * CPU-stall kthread. Invokes rcu_torture_stall_one() once, and then as many + * additional times as specified by the stall_cpu_repeat module parameter. + * Note that stall_cpu_irqsoff is ignored on the second and subsequent + * stall. + */ +static int rcu_torture_stall(void *args) +{ + int i; + int repeat = stall_cpu_repeat; + int ret; + + VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); + if (repeat < 0) { + repeat = 0; + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + } + if (rcu_cpu_stall_notifiers) { + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); + } + for (i = 0; i <= repeat; i++) { + if (kthread_should_stop()) + break; + rcu_torture_stall_one(i, i == 0 ? stall_cpu_irqsoff : 0); + } pr_alert("%s end.\n", __func__); if (rcu_cpu_stall_notifiers && !ret) { ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); From 1c5144a066fd17eba681b3d6b0f86bda8e06fbfa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:37:50 -0700 Subject: [PATCH 27/64] torture: Add torture.sh --guest-cpu-limit argument for limited hosts Some servers have limitations on the number of CPUs a given guest OS can use. In my earlier experience, such limitations have been at least half of the host's CPUs, but in a recent example, this limit is less than 40%. This commit therefore adds a --guest-cpu-limit argument that allows such low limits to be made known to torture.sh. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- .../selftests/rcutorture/bin/torture.sh | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 990d24696fd3..0447c4a00cc4 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -19,10 +19,10 @@ PATH=${RCUTORTURE}/bin:$PATH; export PATH TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`" MAKE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS*2)) -HALF_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2)) -if test "$HALF_ALLOTED_CPUS" -lt 1 +SCALE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2)) +if test "$SCALE_ALLOTED_CPUS" -lt 1 then - HALF_ALLOTED_CPUS=1 + SCALE_ALLOTED_CPUS=1 fi VERBOSE_BATCH_CPUS=$((TORTURE_ALLOTED_CPUS/16)) if test "$VERBOSE_BATCH_CPUS" -lt 2 @@ -90,6 +90,7 @@ usage () { echo " --do-scftorture / --do-no-scftorture / --no-scftorture" echo " --do-srcu-lockdep / --do-no-srcu-lockdep / --no-srcu-lockdep" echo " --duration [ | h | d ]" + echo " --guest-cpu-limit N" echo " --kcsan-kmake-arg kernel-make-arguments" exit 1 } @@ -203,6 +204,21 @@ do duration_base=$(($ts*mult)) shift ;; + --guest-cpu-limit|--guest-cpu-lim) + checkarg --guest-cpu-limit "(number)" "$#" "$2" '^[0-9]*$' '^--' + if (("$2" <= "$TORTURE_ALLOTED_CPUS" / 2)) + then + SCALE_ALLOTED_CPUS="$2" + VERBOSE_BATCH_CPUS="$((SCALE_ALLOTED_CPUS/8))" + if (("$VERBOSE_BATCH_CPUS" < 2)) + then + VERBOSE_BATCH_CPUS=0 + fi + else + echo "Ignoring value of $2 for --guest-cpu-limit which is greater than (("$TORTURE_ALLOTED_CPUS" / 2))." + fi + shift + ;; --kcsan-kmake-arg|--kcsan-kmake-args) checkarg --kcsan-kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' kcsan_kmake_args="`echo "$kcsan_kmake_args $2" | sed -e 's/^ *//' -e 's/ *$//'`" @@ -425,9 +441,9 @@ fi if test "$do_scftorture" = "yes" then # Scale memory based on the number of CPUs. 
- scfmem=$((3+HALF_ALLOTED_CPUS/16)) - torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" - torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make + scfmem=$((3+SCALE_ALLOTED_CPUS/16)) + torture_bootargs="scftorture.nthreads=$SCALE_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" + torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory ${scfmem}G --trust-make fi if test "$do_rt" = "yes" @@ -471,8 +487,8 @@ for prim in $primlist do if test -n "$firsttime" then - torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make + torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$SCALE_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" + torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then @@ -520,8 +536,8 @@ for prim in $primlist do if test -n "$firsttime" then - torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$HALF_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make + torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$SCALE_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" + torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then @@ -559,7 +575,7 @@ do_kcsan="$do_kcsan_save" if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" From 
0ff92d145a2be4da47e2ee6287a731084fba112f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:52:29 -0700 Subject: [PATCH 28/64] doc: Remove RCU Tasks Rude asynchronous APIs The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are no longer. This commit therefore removes them from the documentation. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- .../RCU/Design/Requirements/Requirements.rst | 3 +- Documentation/RCU/checklist.rst | 61 +++++++++---------- Documentation/RCU/whatisRCU.rst | 2 +- .../admin-guide/kernel-parameters.txt | 8 --- 4 files changed, 30 insertions(+), 44 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index f511476b4550..6125e7068d2c 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -2649,8 +2649,7 @@ those that are idle from RCU's perspective) and then Tasks Rude RCU can be removed from the kernel. The tasks-rude-RCU API is also reader-marking-free and thus quite compact, -consisting of call_rcu_tasks_rude(), synchronize_rcu_tasks_rude(), -and rcu_barrier_tasks_rude(). +consisting solely of synchronize_rcu_tasks_rude(). Tasks Trace RCU ~~~~~~~~~~~~~~~ diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 3e6407de231c..7de3e308f330 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -194,14 +194,13 @@ over a rather long period of time, but improvements are always welcome! when publicizing a pointer to a structure that can be traversed by an RCU read-side critical section. -5. If any of call_rcu(), call_srcu(), call_rcu_tasks(), - call_rcu_tasks_rude(), or call_rcu_tasks_trace() is used, - the callback function may be invoked from softirq context, - and in any case with bottom halves disabled. In particular, - this callback function cannot block. If you need the callback - to block, run that code in a workqueue handler scheduled from - the callback. The queue_rcu_work() function does this for you - in the case of call_rcu(). +5. If any of call_rcu(), call_srcu(), call_rcu_tasks(), or + call_rcu_tasks_trace() is used, the callback function may be + invoked from softirq context, and in any case with bottom halves + disabled. In particular, this callback function cannot block. + If you need the callback to block, run that code in a workqueue + handler scheduled from the callback. The queue_rcu_work() + function does this for you in the case of call_rcu(). 6. Since synchronize_rcu() can block, it cannot be called from any sort of irq context. The same rule applies @@ -254,10 +253,10 @@ over a rather long period of time, but improvements are always welcome! corresponding readers must use rcu_read_lock_trace() and rcu_read_unlock_trace(). - c. If an updater uses call_rcu_tasks_rude() or - synchronize_rcu_tasks_rude(), then the corresponding - readers must use anything that disables preemption, - for example, preempt_disable() and preempt_enable(). + c. If an updater uses synchronize_rcu_tasks_rude(), + then the corresponding readers must use anything that + disables preemption, for example, preempt_disable() + and preempt_enable(). Mixing things up will result in confusion and broken kernels, and has even resulted in an exploitable security issue. Therefore, @@ -326,11 +325,9 @@ over a rather long period of time, but improvements are always welcome! d. 
Periodically invoke rcu_barrier(), permitting a limited number of updates per grace period. - The same cautions apply to call_srcu(), call_rcu_tasks(), - call_rcu_tasks_rude(), and call_rcu_tasks_trace(). This is - why there is an srcu_barrier(), rcu_barrier_tasks(), - rcu_barrier_tasks_rude(), and rcu_barrier_tasks_rude(), - respectively. + The same cautions apply to call_srcu(), call_rcu_tasks(), and + call_rcu_tasks_trace(). This is why there is an srcu_barrier(), + rcu_barrier_tasks(), and rcu_barrier_tasks_trace(), respectively. Note that although these primitives do take action to avoid memory exhaustion when any given CPU has too many callbacks, @@ -383,17 +380,17 @@ over a rather long period of time, but improvements are always welcome! must use whatever locking or other synchronization is required to safely access and/or modify that data structure. - Do not assume that RCU callbacks will be executed on - the same CPU that executed the corresponding call_rcu(), - call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), or - call_rcu_tasks_trace(). For example, if a given CPU goes offline - while having an RCU callback pending, then that RCU callback - will execute on some surviving CPU. (If this was not the case, - a self-spawning RCU callback would prevent the victim CPU from - ever going offline.) Furthermore, CPUs designated by rcu_nocbs= - might well *always* have their RCU callbacks executed on some - other CPUs, in fact, for some real-time workloads, this is the - whole point of using the rcu_nocbs= kernel boot parameter. + Do not assume that RCU callbacks will be executed on the same + CPU that executed the corresponding call_rcu(), call_srcu(), + call_rcu_tasks(), or call_rcu_tasks_trace(). For example, if + a given CPU goes offline while having an RCU callback pending, + then that RCU callback will execute on some surviving CPU. + (If this was not the case, a self-spawning RCU callback would + prevent the victim CPU from ever going offline.) Furthermore, + CPUs designated by rcu_nocbs= might well *always* have their + RCU callbacks executed on some other CPUs, in fact, for some + real-time workloads, this is the whole point of using the + rcu_nocbs= kernel boot parameter. In addition, do not assume that callbacks queued in a given order will be invoked in that order, even if they all are queued on the @@ -507,9 +504,9 @@ over a rather long period of time, but improvements are always welcome! These debugging aids can help you find problems that are otherwise extremely difficult to spot. -17. If you pass a callback function defined within a module to one of - call_rcu(), call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), - or call_rcu_tasks_trace(), then it is necessary to wait for all +17. If you pass a callback function defined within a module + to one of call_rcu(), call_srcu(), call_rcu_tasks(), or + call_rcu_tasks_trace(), then it is necessary to wait for all pending callbacks to be invoked before unloading that module. Note that it is absolutely *not* sufficient to wait for a grace period! For example, synchronize_rcu() implementation is *not* @@ -522,7 +519,6 @@ over a rather long period of time, but improvements are always welcome! 
- call_rcu() -> rcu_barrier() - call_srcu() -> srcu_barrier() - call_rcu_tasks() -> rcu_barrier_tasks() - - call_rcu_tasks_rude() -> rcu_barrier_tasks_rude() - call_rcu_tasks_trace() -> rcu_barrier_tasks_trace() However, these barrier functions are absolutely *not* guaranteed @@ -539,7 +535,6 @@ over a rather long period of time, but improvements are always welcome! - Either synchronize_srcu() or synchronize_srcu_expedited(), together with and srcu_barrier() - synchronize_rcu_tasks() and rcu_barrier_tasks() - - synchronize_tasks_rude() and rcu_barrier_tasks_rude() - synchronize_tasks_trace() and rcu_barrier_tasks_trace() If necessary, you can use something like workqueues to execute diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index d585a5490aee..1ef5784c1b84 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -1103,7 +1103,7 @@ RCU-Tasks-Rude:: Critical sections Grace period Barrier - N/A call_rcu_tasks_rude rcu_barrier_tasks_rude + N/A N/A synchronize_rcu_tasks_rude diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1384c7b59c9..a6107b745076 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5572,14 +5572,6 @@ of zero will disable batching. Batching is always disabled for synchronize_rcu_tasks(). - rcupdate.rcu_tasks_rude_lazy_ms= [KNL] - Set timeout in milliseconds RCU Tasks - Rude asynchronous callback batching for - call_rcu_tasks_rude(). A negative value - will take the default. A value of zero will - disable batching. Batching is always disabled - for synchronize_rcu_tasks_rude(). - rcupdate.rcu_tasks_trace_lazy_ms= [KNL] Set timeout in milliseconds RCU Tasks Trace asynchronous callback batching for From b428a9de9bba8877aeb5de4691cc9334467065d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 09:04:08 -0700 Subject: [PATCH 29/64] rcutorture: Stop testing RCU Tasks Rude asynchronous APIs The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently unused. Furthermore, the idea is to get rid of RCU Tasks Rude entirely once all architectures have their deep-idle and entry/exit code correctly marked as inline or noinstr. As a first step towards this goal, this commit therefore removes these two functions from rcutorture testing. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 08bf7c669dd3..cd8752b2a960 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -915,11 +915,6 @@ static struct rcu_torture_ops tasks_ops = { * Definitions for rude RCU-tasks torture testing. */ -static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); -} - static struct rcu_torture_ops tasks_rude_ops = { .ttype = RCU_TASKS_RUDE_FLAVOR, .init = rcu_sync_torture_init, @@ -927,11 +922,8 @@ static struct rcu_torture_ops tasks_rude_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = rcu_torture_read_unlock_trivial, .get_gp_seq = rcu_no_completed, - .deferred_free = rcu_tasks_rude_torture_deferred_free, .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, - .call = call_rcu_tasks_rude, - .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, .get_gp_data = rcu_tasks_rude_get_gp_data, .cbflood_max = 50000, From 599194d01459670049079ff54d52442f54d38432 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:46:00 -0700 Subject: [PATCH 30/64] rcuscale: Stop testing RCU Tasks Rude asynchronous APIs The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently unused. Furthermore, the idea is to get rid of RCU Tasks Rude entirely once all architectures have their deep-idle and entry/exit code correctly marked as inline or noinstr. As a step towards this goal, this commit therefore removes these two functions from rcuscale testing. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index b53a9e8f5904..d534d4ec2314 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -333,8 +333,6 @@ static struct rcu_scale_ops tasks_rude_ops = { .readunlock = tasks_rude_scale_read_unlock, .get_gp_seq = rcu_no_completed, .gp_diff = rcu_seq_diff, - .async = call_rcu_tasks_rude, - .gp_barrier = rcu_barrier_tasks_rude, .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, From 7945b741d1fc071a621366c512a060ea08848955 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:47:47 -0700 Subject: [PATCH 31/64] rcu-tasks: Remove RCU Tasks Rude asynchronous APIs The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently unused. This commit therefore removes their definitions and boot-time self-tests. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 2 -- kernel/rcu/tasks.h | 55 +++++++++++++--------------------------- 2 files changed, 17 insertions(+), 40 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13f6f00aecf9..31e679c7110e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -37,7 +37,6 @@ /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); -void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); struct rcu_gp_oldstate; @@ -202,7 +201,6 @@ do { \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU -void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks_rude(void); # endif diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ba3440a45b6d..4bc038bcc016 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -396,7 +396,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) // Wait for all in-flight callbacks for the specified RCU Tasks flavor. // Operates in a manner similar to rcu_barrier(). 
-static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) +static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp) { int cpu; unsigned long flags; @@ -1244,13 +1244,12 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } //////////////////////////////////////////////////////////////////////// // -// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of -// passing an empty function to schedule_on_each_cpu(). This approach -// provides an asynchronous call_rcu_tasks_rude() API and batching of -// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API. -// This invokes schedule_on_each_cpu() in order to send IPIs far and wide -// and induces otherwise unnecessary context switches on all online CPUs, -// whether idle or not. +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's +// trick of passing an empty function to schedule_on_each_cpu(). +// This approach provides batching of concurrent calls to the synchronous +// synchronize_rcu_tasks_rude() API. This invokes schedule_on_each_cpu() +// in order to send IPIs far and wide and induces otherwise unnecessary +// context switches on all online CPUs, whether idle or not. // // Callback handling is provided by the rcu_tasks_kthread() function. // @@ -1268,11 +1267,11 @@ static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) schedule_on_each_cpu(rcu_tasks_be_rude); } -void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, "RCU Tasks Rude"); -/** +/* * call_rcu_tasks_rude() - Queue a callback rude task-based grace period * @rhp: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period @@ -1289,12 +1288,14 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. + * + * This is no longer exported, and is instead reserved for use by + * synchronize_rcu_tasks_rude(). */ -void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) { call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); } -EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); /** * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period @@ -1320,26 +1321,9 @@ void synchronize_rcu_tasks_rude(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); -/** - * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks_rude(void) -{ - rcu_barrier_tasks_generic(&rcu_tasks_rude); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); - -int rcu_tasks_rude_lazy_ms = -1; -module_param(rcu_tasks_rude_lazy_ms, int, 0444); - static int __init rcu_spawn_tasks_rude_kthread(void) { rcu_tasks_rude.gp_sleep = HZ / 10; - if (rcu_tasks_rude_lazy_ms >= 0) - rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms); rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } @@ -2069,11 +2053,6 @@ static struct rcu_tasks_test_desc tests[] = { /* If not defined, the test is skipped. 
*/ .notrun = IS_ENABLED(CONFIG_TASKS_RCU), }, - { - .name = "call_rcu_tasks_rude()", - /* If not defined, the test is skipped. */ - .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU), - }, { .name = "call_rcu_tasks_trace()", /* If not defined, the test is skipped. */ @@ -2081,6 +2060,7 @@ static struct rcu_tasks_test_desc tests[] = { } }; +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) static void test_rcu_tasks_callback(struct rcu_head *rhp) { struct rcu_tasks_test_desc *rttd = @@ -2090,6 +2070,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) rttd->notrun = false; } +#endif // #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) static void rcu_tasks_initiate_self_tests(void) { @@ -2102,16 +2083,14 @@ static void rcu_tasks_initiate_self_tests(void) #ifdef CONFIG_TASKS_RUDE_RCU pr_info("Running RCU Tasks Rude wait API self tests\n"); - tests[1].runstart = jiffies; synchronize_rcu_tasks_rude(); - call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU pr_info("Running RCU Tasks Trace wait API self tests\n"); - tests[2].runstart = jiffies; + tests[1].runstart = jiffies; synchronize_rcu_tasks_trace(); - call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); + call_rcu_tasks_trace(&tests[1].rh, test_rcu_tasks_callback); #endif } From fd70e9f1d85f5323096ad313ba73f5fe3d15ea41 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 10 Jul 2024 12:45:42 +0800 Subject: [PATCH 32/64] rcu-tasks: Fix access non-existent percpu rtpcp variable in rcu_tasks_need_gpcb() For kernels built with CONFIG_FORCE_NR_CPUS=y, the nr_cpu_ids is defined as NR_CPUS instead of the number of possible cpus, this will cause the following system panic: smpboot: Allowing 4 CPUs, 0 hotplug CPUs ... setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:512 nr_node_ids:1 ... BUG: unable to handle page fault for address: ffffffff9911c8c8 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 15 Comm: rcu_tasks_trace Tainted: G W 6.6.21 #1 5dc7acf91a5e8e9ac9dcfc35bee0245691283ea6 RIP: 0010:rcu_tasks_need_gpcb+0x25d/0x2c0 RSP: 0018:ffffa371c00a3e60 EFLAGS: 00010082 CR2: ffffffff9911c8c8 CR3: 000000040fa20005 CR4: 00000000001706f0 Call Trace: ? __die+0x23/0x80 ? page_fault_oops+0xa4/0x180 ? exc_page_fault+0x152/0x180 ? asm_exc_page_fault+0x26/0x40 ? rcu_tasks_need_gpcb+0x25d/0x2c0 ? __pfx_rcu_tasks_kthread+0x40/0x40 rcu_tasks_one_gp+0x69/0x180 rcu_tasks_kthread+0x94/0xc0 kthread+0xe8/0x140 ? __pfx_kthread+0x40/0x40 ret_from_fork+0x34/0x80 ? __pfx_kthread+0x40/0x40 ret_from_fork_asm+0x1b/0x80 Considering that there may be holes in the CPU numbers, use the maximum possible cpu number, instead of nr_cpu_ids, for configuring enqueue and dequeue limits. [ neeraj.upadhyay: Fix htmldocs build error reported by Stephen Rothwell ] Closes: https://lore.kernel.org/linux-input/CALMA0xaTSMN+p4xUXkzrtR5r6k7hgoswcaXx7baR_z9r5jjskw@mail.gmail.com/T/#u Reported-by: Zhixu Liu Signed-off-by: Zqiang Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tasks.h | 82 ++++++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 29 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4bc038bcc016..72d564c84499 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -34,6 +34,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_blkd_tasks: List of tasks blocked as readers. * @rtp_exit_list: List of tasks in the latter portion of do_exit(). * @cpu: CPU number corresponding to this entry. 
+ * @index: Index of this CPU in rtpcp_array of the rcu_tasks structure. * @rtpp: Pointer to the rcu_tasks structure. */ struct rcu_tasks_percpu { @@ -49,6 +50,7 @@ struct rcu_tasks_percpu { struct list_head rtp_blkd_tasks; struct list_head rtp_exit_list; int cpu; + int index; struct rcu_tasks *rtpp; }; @@ -76,6 +78,7 @@ struct rcu_tasks_percpu { * @call_func: This flavor's call_rcu()-equivalent function. * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE). * @rtpcpu: This flavor's rcu_tasks_percpu structure. + * @rtpcp_array: Array of pointers to rcu_tasks_percpu structure of CPUs in cpu_possible_mask. * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. @@ -110,6 +113,7 @@ struct rcu_tasks { call_rcu_func_t call_func; unsigned int wait_state; struct rcu_tasks_percpu __percpu *rtpcpu; + struct rcu_tasks_percpu **rtpcp_array; int percpu_enqueue_shift; int percpu_enqueue_lim; int percpu_dequeue_lim; @@ -182,6 +186,8 @@ module_param(rcu_task_collapse_lim, int, 0444); static int rcu_task_lazy_lim __read_mostly = 32; module_param(rcu_task_lazy_lim, int, 0444); +static int rcu_task_cpu_ids; + /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 #define RTGS_WAIT_WAIT_CBS 1 @@ -245,6 +251,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int cpu; int lim; int shift; + int maxcpu; + int index = 0; if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; @@ -254,14 +262,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp) } lim = rcu_task_enqueue_lim; - if (lim > nr_cpu_ids) - lim = nr_cpu_ids; - shift = ilog2(nr_cpu_ids / lim); - if (((nr_cpu_ids - 1) >> shift) >= lim) - shift++; - WRITE_ONCE(rtp->percpu_enqueue_shift, shift); - WRITE_ONCE(rtp->percpu_dequeue_lim, lim); - smp_store_release(&rtp->percpu_enqueue_lim, lim); + rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL); + BUG_ON(!rtp->rtpcp_array); + for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); @@ -273,14 +276,29 @@ static void cblist_init_generic(struct rcu_tasks *rtp) INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; + rtpcp->index = index; + rtp->rtpcp_array[index] = rtpcp; + index++; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); if (!rtpcp->rtp_exit_list.next) INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + maxcpu = cpu; } - pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, - data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); + rcu_task_cpu_ids = maxcpu + 1; + if (lim > rcu_task_cpu_ids) + lim = rcu_task_cpu_ids; + shift = ilog2(rcu_task_cpu_ids / lim); + if (((rcu_task_cpu_ids - 1) >> shift) >= lim) + shift++; + WRITE_ONCE(rtp->percpu_enqueue_shift, shift); + WRITE_ONCE(rtp->percpu_dequeue_lim, lim); + smp_store_release(&rtp->percpu_enqueue_lim, lim); + + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n", + rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), + rcu_task_cb_adjust, rcu_task_cpu_ids); } // Compute wakeup time for lazy callback timer. 
@@ -348,7 +366,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rtpcp->rtp_n_lock_retries = 0; } if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim && - READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) + READ_ONCE(rtp->percpu_enqueue_lim) != rcu_task_cpu_ids) needadjust = true; // Defer adjustment to avoid deadlock. } // Queuing callbacks before initialization not yet supported. @@ -368,10 +386,10 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); - if (rtp->percpu_enqueue_lim != nr_cpu_ids) { + if (rtp->percpu_enqueue_lim != rcu_task_cpu_ids) { WRITE_ONCE(rtp->percpu_enqueue_shift, 0); - WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); - smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); + WRITE_ONCE(rtp->percpu_dequeue_lim, rcu_task_cpu_ids); + smp_store_release(&rtp->percpu_enqueue_lim, rcu_task_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); @@ -444,6 +462,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim); for (cpu = 0; cpu < dequeue_limit; cpu++) { + if (!cpu_possible(cpu)) + continue; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ @@ -481,7 +501,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim > 1) { - WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(rcu_task_cpu_ids)); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); gpdone = false; @@ -496,7 +516,9 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); } if (rtp->percpu_dequeue_lim == 1) { - for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { + for (cpu = rtp->percpu_dequeue_lim; cpu < rcu_task_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); @@ -511,30 +533,32 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) // Advance callbacks and invoke any that are ready. static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp) { - int cpu; - int cpunext; int cpuwq; unsigned long flags; int len; + int index; struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); struct rcu_tasks_percpu *rtpcp_next; - cpu = rtpcp->cpu; - cpunext = cpu * 2 + 1; - if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { - rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; - queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); - cpunext++; - if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { - rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - cpuwq = rcu_cpu_beenfullyonline(cpunext) ? 
cpunext : WORK_CPU_UNBOUND; + index = rtpcp->index * 2 + 1; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + index++; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + } + } } } - if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) + if (rcu_segcblist_empty(&rtpcp->cblist)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); From 49f49266ec9d61a4d038cb4ddff4417ba793198f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:21 -0700 Subject: [PATCH 33/64] rcu/tasks: Check processor-ID assumptions The current mapping of smp_processor_id() to a CPU processing Tasks-RCU callbacks makes some assumptions about layout. This commit therefore adds a WARN_ON() to check these assumptions. [ neeraj.upadhyay: Replace nr_cpu_ids with rcu_task_cpu_ids. ] Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tasks.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 72d564c84499..9fb8bb0afff7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -357,6 +357,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rcu_read_lock(); ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + WARN_ON_ONCE(chosen_cpu >= rcu_task_cpu_ids); rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. From 522295774db51ab39ea3d46926555401c8f10130 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:22 -0700 Subject: [PATCH 34/64] rcu/tasks: Update rtp->tasks_gp_seq comment The rtp->tasks_gp_seq grace-period sequence number is not a strict count, but rather the usual RCU sequence number with the lower few bits tracking per-grace-period state and the upper bits the count of grace periods since boot, give or take the initial value. This commit therefore adjusts this comment. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 9fb8bb0afff7..127018280618 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -65,7 +65,7 @@ struct rcu_tasks_percpu { * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. - * @tasks_gp_seq: Number of grace periods completed since boot. + * @tasks_gp_seq: Number of grace periods completed since boot in upper bits. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. From 54973cdd166b13195a99940cd21a827c3a99cac2 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 1 Aug 2024 17:34:23 -0700 Subject: [PATCH 35/64] rcu: Provide rcu_barrier_cb_is_done() to check rcu_barrier() CBs This commit provides a rcu_barrier_cb_is_done() function that returns true if the *rcu_barrier*() callback passed in is done. This will be used when printing grace-period debugging information. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcu.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 38238e595a61..caaed27e476b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -255,6 +255,11 @@ static inline void debug_rcu_head_callback(struct rcu_head *rhp) kmem_dump_obj(rhp); } +static inline bool rcu_barrier_cb_is_done(struct rcu_head *rhp) +{ + return rhp->next == rhp; +} + extern int rcu_cpu_stall_suppress_at_boot; static inline bool rcu_stall_is_suppressed_at_boot(void) From d3f84aeb71d61ece24164bb2ce3161c60922fd8c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:24 -0700 Subject: [PATCH 36/64] rcu/tasks: Mark callbacks not currently participating in barrier operation Each Tasks RCU flavor keeps a count of the number of callbacks that the current rcu_barrier_tasks*() is waiting on, but there is currently no easy way to work out which callback is stuck. One way to do this is to mark idle RCU-barrier callbacks by making the ->next pointer point to the callback itself, and this commit does just that. Later commits will use this for debug output. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tasks.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 127018280618..d44abcd656d6 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -283,6 +283,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); if (!rtpcp->rtp_exit_list.next) INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + rtpcp->barrier_q_head.next = &rtpcp->barrier_q_head; maxcpu = cpu; } @@ -407,6 +408,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) struct rcu_tasks *rtp; struct rcu_tasks_percpu *rtpcp; + rhp->next = rhp; // Mark the callback as having been invoked. rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head); rtp = rtpcp->rtpp; if (atomic_dec_and_test(&rtp->barrier_q_count)) From fe91cf39db0939ac8d523b5a1c31840f7cbe205c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:25 -0700 Subject: [PATCH 37/64] rcu/tasks: Add detailed grace-period and barrier diagnostics This commit adds rcu_tasks_torture_stats_print(), rcu_tasks_trace_torture_stats_print(), and rcu_tasks_rude_torture_stats_print() functions that provide detailed diagnostics on grace-period, callback, and barrier state. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 3 ++ kernel/rcu/tasks.h | 66 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 31e679c7110e..17463e95b6ef 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -164,6 +164,7 @@ static inline void rcu_nocb_flush_deferred_wakeup(void) { } } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); +void rcu_tasks_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu @@ -190,6 +191,7 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t); rcu_tasks_trace_qs_blkd(t); \ } \ } while (0) +void rcu_tasks_trace_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif @@ -202,6 +204,7 @@ do { \ # ifdef CONFIG_TASKS_RUDE_RCU void synchronize_rcu_tasks_rude(void); +void rcu_tasks_rude_torture_stats_print(char *tt, char *tf); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d44abcd656d6..5f6d80ce1e47 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -714,9 +714,7 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -#endif /* #ifndef CONFIG_TINY_RCU */ -#ifndef CONFIG_TINY_RCU /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -750,6 +748,52 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) rtp->lazy_jiffies, s); } + +/* Dump out more rcutorture-relevant state common to all RCU-tasks flavors. 
*/ +static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *tt, + char *tf, char *tst) +{ + cpumask_var_t cm; + int cpu; + bool gotcb = false; + unsigned long j = jiffies; + + pr_alert("%s%s Tasks%s RCU g%ld gp_start %lu gp_jiffies %lu gp_state %d (%s).\n", + tt, tf, tst, data_race(rtp->tasks_gp_seq), + j - data_race(rtp->gp_start), j - data_race(rtp->gp_jiffies), + data_race(rtp->gp_state), tasks_gp_state_getname(rtp)); + pr_alert("\tEnqueue shift %d limit %d Dequeue limit %d gpseq %lu.\n", + data_race(rtp->percpu_enqueue_shift), + data_race(rtp->percpu_enqueue_lim), + data_race(rtp->percpu_dequeue_lim), + data_race(rtp->percpu_dequeue_gpseq)); + (void)zalloc_cpumask_var(&cm, GFP_KERNEL); + pr_alert("\tCallback counts:"); + for_each_possible_cpu(cpu) { + long n; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (cpumask_available(cm) && !rcu_barrier_cb_is_done(&rtpcp->barrier_q_head)) + cpumask_set_cpu(cpu, cm); + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (!n) + continue; + pr_cont(" %d:%ld", cpu, n); + gotcb = true; + } + if (gotcb) + pr_cont(".\n"); + else + pr_cont(" (none).\n"); + pr_alert("\tBarrier seq %lu count %d holdout CPUs ", + data_race(rtp->barrier_q_seq), atomic_read(&rtp->barrier_q_count)); + if (cpumask_available(cm) && !cpumask_empty(cm)) + pr_cont(" %*pbl.\n", cpumask_pr_args(cm)); + else + pr_cont("(none).\n"); + free_cpumask_var(cm); +} + #endif // #ifndef CONFIG_TINY_RCU static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -1201,6 +1245,12 @@ void show_rcu_tasks_classic_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); + +void rcu_tasks_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_gp_kthread(void) @@ -1361,6 +1411,12 @@ void show_rcu_tasks_rude_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); + +void rcu_tasks_rude_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_rude, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_rude_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_rude_gp_kthread(void) @@ -2038,6 +2094,12 @@ void show_rcu_tasks_trace_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); + +void rcu_tasks_trace_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_trace_gp_kthread(void) From 591ce640819ff10385cd072fb19c2ca5d5181fe5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:26 -0700 Subject: [PATCH 38/64] rcu/tasks: Add rcu_barrier_tasks*() start time to diagnostics This commit adds the start time, in jiffies, of the most recently started rcu_barrier_tasks*() operation to the diagnostic output used by rcuscale. This information can be helpful in distinguishing a hung barrier operation from a long series of barrier operations. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tasks.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5f6d80ce1e47..36be92efb0da 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -87,6 +87,7 @@ struct rcu_tasks_percpu { * @barrier_q_count: Number of queues being waited on. * @barrier_q_completion: Barrier wait/wakeup mechanism. * @barrier_q_seq: Sequence number for barrier operations. + * @barrier_q_start: Most recent barrier start in jiffies. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ @@ -122,6 +123,7 @@ struct rcu_tasks { atomic_t barrier_q_count; struct completion barrier_q_completion; unsigned long barrier_q_seq; + unsigned long barrier_q_start; char *name; char *kname; }; @@ -430,6 +432,7 @@ static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp) mutex_unlock(&rtp->barrier_q_mutex); return; } + rtp->barrier_q_start = jiffies; rcu_seq_start(&rtp->barrier_q_seq); init_completion(&rtp->barrier_q_completion); atomic_set(&rtp->barrier_q_count, 2); @@ -785,8 +788,9 @@ static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *t pr_cont(".\n"); else pr_cont(" (none).\n"); - pr_alert("\tBarrier seq %lu count %d holdout CPUs ", - data_race(rtp->barrier_q_seq), atomic_read(&rtp->barrier_q_count)); + pr_alert("\tBarrier seq %lu start %lu count %d holdout CPUs ", + data_race(rtp->barrier_q_seq), j - data_race(rtp->barrier_q_start), + atomic_read(&rtp->barrier_q_count)); if (cpumask_available(cm) && !cpumask_empty(cm)) pr_cont(" %*pbl.\n", cpumask_pr_args(cm)); else From 3e49aea71d5bac93b892f954d14bc8070e4cc651 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Jul 2024 19:57:47 -0700 Subject: [PATCH 39/64] refscale: Add TINY scenario This commit adds a TINY scenario in order to support tests of Tiny RCU and Tiny SRCU. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- .../rcutorture/configs/refscale/TINY | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/refscale/TINY diff --git a/tools/testing/selftests/rcutorture/configs/refscale/TINY b/tools/testing/selftests/rcutorture/configs/refscale/TINY new file mode 100644 index 000000000000..759343980b80 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refscale/TINY @@ -0,0 +1,20 @@ +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n +#CHECK#CONFIG_PREEMPT_RCU=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n From 4e39bb49c2de445848c2668d7f9fec6ba70724ae Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 21 Jul 2024 09:23:46 +0200 Subject: [PATCH 40/64] refscale: Optimize process_durations() process_durations() is not a hot path, but there is no good reason to iterate over and over the data already in 'buf'. Using a seq_buf saves some useless strcat() and the need of a temp buffer. Data is written directly at the correct place. Signed-off-by: Christophe JAILLET Tested-by: "Paul E. 
McKenney" Reviewed-by: Davidlohr Bueso Signed-off-by: Neeraj Upadhyay --- kernel/rcu/refscale.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index f4ea5b1ec068..cfec0648e141 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -891,32 +892,34 @@ static u64 process_durations(int n) { int i; struct reader_task *rt; - char buf1[64]; + struct seq_buf s; char *buf; u64 sum = 0; buf = kmalloc(800 + 64, GFP_KERNEL); if (!buf) return 0; - buf[0] = 0; - sprintf(buf, "Experiment #%d (Format: :)", - exp_idx); + seq_buf_init(&s, buf, 800 + 64); + + seq_buf_printf(&s, "Experiment #%d (Format: :)", + exp_idx); for (i = 0; i < n && !torture_must_stop(); i++) { rt = &(reader_tasks[i]); - sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); if (i % 5 == 0) - strcat(buf, "\n"); - if (strlen(buf) >= 800) { - pr_alert("%s", buf); - buf[0] = 0; + seq_buf_putc(&s, '\n'); + + if (seq_buf_used(&s) >= 800) { + pr_alert("%s", seq_buf_str(&s)); + seq_buf_clear(&s); } - strcat(buf, buf1); + + seq_buf_printf(&s, "%d: %llu\t", i, rt->last_duration_ns); sum += rt->last_duration_ns; } - pr_alert("%s\n", buf); + pr_alert("%s\n", seq_buf_str(&s)); kfree(buf); return sum; From ea793764b5c677cf16f114a5ee3fb620560a78f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:42:58 -0700 Subject: [PATCH 41/64] rcuscale: Save a few lines with whitespace-only change This whitespace-only commit fuses a few lines of code, taking advantage of the newish 100-character-per-line limit to save a few lines of code. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index d534d4ec2314..3269dd9c639f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -1015,13 +1015,9 @@ rcu_scale_init(void) } while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); - writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), - GFP_KERNEL); - writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), - GFP_KERNEL); - writer_n_durations = - kcalloc(nrealwriters, sizeof(*writer_n_durations), - GFP_KERNEL); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL); + writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); if (!writer_tasks || !writer_durations || !writer_n_durations) { SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; From 42a8a2695ccbb98c15d73703cb39af70cdcf45fb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:42:59 -0700 Subject: [PATCH 42/64] rcuscale: Dump stacks of stalled rcu_scale_writer() instances This commit improves debuggability by dumping the stacks of rcu_scale_writer() instances that have not completed in a reasonable timeframe. These stacks are dumped remotely, but they will be accurate in the thus-far common case where the stalled rcu_scale_writer() instances are blocked. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 3269dd9c639f..5087ca7062d9 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "rcu.h" @@ -111,6 +112,7 @@ static struct task_struct **reader_tasks; static struct task_struct *shutdown_task; static u64 **writer_durations; +static bool *writer_done; static int *writer_n_durations; static atomic_t n_rcu_scale_reader_started; static atomic_t n_rcu_scale_writer_started; @@ -524,6 +526,7 @@ rcu_scale_writer(void *arg) started = true; if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) { done = true; + WRITE_ONCE(writer_done[me], true); sched_set_normal(current, 0); pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", scale_type, SCALE_FLAG, me, MIN_MEAS); @@ -549,6 +552,19 @@ rcu_scale_writer(void *arg) if (done && !alldone && atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) alldone = true; + if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) { + static atomic_t dumped; + int i; + + if (!atomic_xchg(&dumped, 1)) { + for (i = 0; i < nrealwriters; i++) { + if (writer_done[i]) + continue; + pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i); + sched_show_task(writer_tasks[i]); + } + } + } if (started && !alldone && i < MAX_MEAS - 1) i++; rcu_scale_wait_shutdown(); @@ -921,6 +937,8 @@ rcu_scale_cleanup(void) kfree(writer_tasks); kfree(writer_durations); kfree(writer_n_durations); + kfree(writer_done); + writer_done = NULL; } /* Do torture-type-specific cleanup operations. */ @@ -1015,10 +1033,11 @@ rcu_scale_init(void) } while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); - writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), GFP_KERNEL); + writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL); writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL); writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); - if (!writer_tasks || !writer_durations || !writer_n_durations) { + writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done) { SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; From 48c21c020fcacba6deb065c78ac1876418b5be11 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:00 -0700 Subject: [PATCH 43/64] rcuscale: Dump grace-period statistics when rcu_scale_writer() stalls This commit adds a .stats function pointer to the rcu_scale_ops structure, and if this is non-NULL, it is invoked after stack traces are dumped in response to a rcu_scale_writer() stall. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 5087ca7062d9..1b9a43653d7e 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -145,6 +145,7 @@ struct rcu_scale_ops { void (*sync)(void); void (*exp_sync)(void); struct task_struct *(*rso_gp_kthread)(void); + void (*stats)(void); const char *name; }; @@ -226,6 +227,11 @@ static void srcu_scale_synchronize(void) synchronize_srcu(srcu_ctlp); } +static void srcu_scale_stats(void) +{ + srcu_torture_stats_print(srcu_ctlp, scale_type, SCALE_FLAG); +} + static void srcu_scale_synchronize_expedited(void) { synchronize_srcu_expedited(srcu_ctlp); @@ -243,6 +249,7 @@ static struct rcu_scale_ops srcu_ops = { .gp_barrier = srcu_rcu_barrier, .sync = srcu_scale_synchronize, .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, .name = "srcu" }; @@ -272,6 +279,7 @@ static struct rcu_scale_ops srcud_ops = { .gp_barrier = srcu_rcu_barrier, .sync = srcu_scale_synchronize, .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, .name = "srcud" }; @@ -563,6 +571,8 @@ rcu_scale_writer(void *arg) pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i); sched_show_task(writer_tasks[i]); } + if (cur_ops->stats) + cur_ops->stats(); } } if (started && !alldone && i < MAX_MEAS - 1) From 0616f7e981968743edb1ef758ba1a0e984240419 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:01 -0700 Subject: [PATCH 44/64] rcu: Mark callbacks not currently participating in barrier operation RCU keeps a count of the number of callbacks that the current rcu_barrier() is waiting on, but there is currently no easy way to work out which callback is stuck. One way to do this is to mark idle RCU-barrier callbacks by making the ->next pointer point to the callback itself, and this commit does just that. Later commits will use this for debug output. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..f931171daecd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4403,6 +4403,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) { unsigned long __maybe_unused s = rcu_state.barrier_sequence; + rhp->next = rhp; // Mark the callback as having been invoked. if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { rcu_barrier_trace(TPS("LastCB"), -1, s); complete(&rcu_state.barrier_completion); @@ -5424,6 +5425,8 @@ static void __init rcu_init_one(void) while (i > rnp->grphi) rnp++; per_cpu_ptr(&rcu_data, i)->mynode = rnp; + per_cpu_ptr(&rcu_data, i)->barrier_head.next = + &per_cpu_ptr(&rcu_data, i)->barrier_head; rcu_boot_init_percpu_data(i); } } From 674fc922f06f7632ac94536ca92dd8addb606f46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:02 -0700 Subject: [PATCH 45/64] rcuscale: Print detailed grace-period and barrier diagnostics This commit uses the new rcu_tasks_torture_stats_print(), rcu_tasks_trace_torture_stats_print(), and rcu_tasks_rude_torture_stats_print() functions in order to provide detailed diagnostics on grace-period, callback, and barrier state when rcu_scale_writer() hangs. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 1b9a43653d7e..c507750e94d8 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -298,6 +298,11 @@ static void tasks_scale_read_unlock(int idx) { } +static void rcu_tasks_scale_stats(void) +{ + rcu_tasks_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -310,6 +315,7 @@ static struct rcu_scale_ops tasks_ops = { .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, .rso_gp_kthread = get_rcu_tasks_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_scale_stats, .name = "tasks" }; @@ -336,6 +342,11 @@ static void tasks_rude_scale_read_unlock(int idx) { } +static void rcu_tasks_rude_scale_stats(void) +{ + rcu_tasks_rude_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_rude_ops = { .ptype = RCU_TASKS_RUDE_FLAVOR, .init = rcu_sync_scale_init, @@ -346,6 +357,7 @@ static struct rcu_scale_ops tasks_rude_ops = { .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_rude_scale_stats, .name = "tasks-rude" }; @@ -374,6 +386,11 @@ static void tasks_trace_scale_read_unlock(int idx) rcu_read_unlock_trace(); } +static void rcu_tasks_trace_scale_stats(void) +{ + rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_tracing_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -386,6 +403,7 @@ static struct rcu_scale_ops tasks_tracing_ops = { .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats, .name = "tasks-tracing" }; From ac9d45544cd571decca395715d0b0a3b617d02f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 13:33:58 -0700 Subject: [PATCH 46/64] locking/csd_lock: Provide an indication of ongoing CSD-lock stall If a CSD-lock stall goes on long enough, it will cause an RCU CPU stall warning. This additional warning provides much additional console-log traffic and little additional information. Therefore, provide a new csd_lock_is_stuck() function that returns true if there is an ongoing CSD-lock stall. This function will be used by the RCU CPU stall warnings to provide a one-line indication of the stall when this function returns true. [ neeraj.upadhyay: Apply Rik van Riel feedback. ] [ neeraj.upadhyay: Apply kernel test robot feedback. ] Signed-off-by: Paul E. 
McKenney Cc: Imran Khan Cc: Ingo Molnar Cc: Leonardo Bras Cc: "Peter Zijlstra (Intel)" Cc: Rik van Riel Signed-off-by: Neeraj Upadhyay --- include/linux/smp.h | 6 ++++++ kernel/smp.c | 16 ++++++++++++++++ lib/Kconfig.debug | 1 + 3 files changed, 23 insertions(+) diff --git a/include/linux/smp.h b/include/linux/smp.h index fcd61dfe2af3..3871bd32018f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -294,4 +294,10 @@ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); int smpcfd_dying_cpu(unsigned int cpu); +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG +bool csd_lock_is_stuck(void); +#else +static inline bool csd_lock_is_stuck(void) { return false; } +#endif + #endif /* __LINUX_SMP_H */ diff --git a/kernel/smp.c b/kernel/smp.c index e87953729230..202cda4d2a55 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -208,6 +208,19 @@ static int csd_lock_wait_getcpu(call_single_data_t *csd) return -1; } +static atomic_t n_csd_lock_stuck; + +/** + * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? + * + * Returns @true if a CSD-lock acquisition is stuck and has been stuck + * long enough for a "non-responsive CSD lock" message to be printed. + */ +bool csd_lock_is_stuck(void) +{ + return !!atomic_read(&n_csd_lock_stuck); +} + /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, @@ -229,6 +242,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); + atomic_dec(&n_csd_lock_stuck); return true; } @@ -252,6 +266,8 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); + if (firsttime) + atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..4e5f61cba8e4 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1614,6 +1614,7 @@ config SCF_TORTURE_TEST config CSD_LOCK_WAIT_DEBUG bool "Debugging for csd_lock_wait(), called from smp_call_function*()" depends on DEBUG_KERNEL + depends on SMP depends on 64BIT default n help From d40760d6811d55172ba6b90ebd2a60a75f88bffe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jul 2024 14:32:20 -0700 Subject: [PATCH 47/64] locking/csd-lock: Use backoff for repeated reports of same incident Currently, the CSD-lock diagnostics in CONFIG_CSD_LOCK_WAIT_DEBUG=y kernels are emitted at five-second intervals. Although this has proven to be a good time interval for the first diagnostic, if the target CPU keeps interrupts disabled for way longer than five seconds, the ratio of useful new information to pointless repetition increases considerably. Therefore, back off the time period for repeated reports of the same incident, increasing linearly with the number of reports and logarithmicly with the number of online CPUs. [ paulmck: Apply Dan Carpenter feedback. ] Signed-off-by: Paul E. 
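[ Aside, not part of the patch: the backoff implemented in the hunk below scales the report interval linearly with the number of messages already printed and logarithmically with the number of online CPUs. A sketch of the arithmetic, using the same expression as the patch: with the default 5-second csd_lock_timeout_ns and 64 online CPUs (ilog2(64) / 2 + 1 == 4), the first report still fires after 5 seconds, the second roughly 5s * 2 * 4 = 40 seconds later, the third roughly 5s * 3 * 4 = 60 seconds after that, and so on. ]

#include <linux/log2.h>
#include <linux/cpumask.h>

/* Illustrative only: effective wait before the next repeated CSD report. */
static u64 csd_report_interval_ns(u64 csd_lock_timeout_ns, unsigned long nmessages)
{
	unsigned int factor = !nmessages ? 1 : ilog2(num_online_cpus()) / 2 + 1;

	return csd_lock_timeout_ns * (nmessages + 1) * factor;
}
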
McKenney Cc: Imran Khan Cc: Ingo Molnar Cc: Leonardo Bras Cc: "Peter Zijlstra (Intel)" Cc: Rik van Riel Reviewed-by: Rik van Riel Signed-off-by: Neeraj Upadhyay --- kernel/smp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 202cda4d2a55..b484ee6dcaf6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -226,7 +226,7 @@ bool csd_lock_is_stuck(void) * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages) { int cpu = -1; int cpux; @@ -249,7 +249,9 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in ts2 = sched_clock(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; - if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) + if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * + (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) || + csd_lock_timeout_ns == 0)) return false; firsttime = !*bug_id; @@ -266,6 +268,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); + (*nmessages)++; if (firsttime) atomic_inc(&n_csd_lock_stuck); /* @@ -306,12 +309,13 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in */ static void __csd_lock_wait(call_single_data_t *csd) { + unsigned long nmessages = 0; int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = sched_clock(); for (;;) { - if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id)) + if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; cpu_relax(); } From 9fbaa44114ca6b69074a488295e86a9fa7e685f9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 15 Jul 2024 13:49:41 -0400 Subject: [PATCH 48/64] smp: print only local CPU info when sched_clock goes backward About 40% of all csd_lock warnings observed in our fleet appear to be due to sched_clock() going backward in time (usually only a little bit), resulting in ts0 being larger than ts2. When the local CPU is at fault, we should print out a message reflecting that, rather than trying to get the remote CPU's stack trace. Signed-off-by: Rik van Riel Tested-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/smp.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/smp.c b/kernel/smp.c index b484ee6dcaf6..f25e20617b7e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -254,6 +254,14 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in csd_lock_timeout_ns == 0)) return false; + if (ts0 > ts2) { + /* Our own sched_clock went backward; don't blame another CPU. */ + ts_delta = ts0 - ts2; + pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta); + *ts1 = ts2; + return false; + } + firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); From 1dd01c06506c42ab3a8026272ee93c466ae0cb4b Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 1 Jul 2024 19:30:56 -0700 Subject: [PATCH 49/64] rcu: Summarize RCU CPU stall warnings during CSD-lock stalls During CSD-lock stalls, the additional information output by RCU CPU stall warnings is usually redundant, flooding the console for not good reason. However, this has been the way things work for a few years. This commit therefore adds an rcutree.csd_lock_suppress_rcu_stall kernel boot parameter that causes RCU CPU stall warnings to be abbreviated to a single line when there is at least one CPU that has been stuck waiting for CSD lock for more than five seconds. To make this abbreviated message happen with decent probability: tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 8 \ --configs "2*TREE01" --kconfig "CONFIG_CSD_LOCK_WAIT_DEBUG=y" \ --bootargs "csdlock_debug=1 rcutorture.stall_cpu=200 \ rcutorture.stall_cpu_holdoff=120 rcutorture.stall_cpu_irqsoff=1 \ rcutree.csd_lock_suppress_rcu_stall=1 \ rcupdate.rcu_exp_cpu_stall_timeout=5000" --trust-make [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ kernel/rcu/tree_stall.h | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1384c7b59c9..d56356c13184 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4937,6 +4937,10 @@ Set maximum number of finished RCU callbacks to process in one batch. + rcutree.csd_lock_suppress_rcu_stall= [KNL] + Do only a one-line RCU CPU stall warning when + there is an ongoing too-long CSD-lock wait. + rcutree.do_rcu_barrier= [KNL] Request a call to rcu_barrier(). This is throttled so that userspace tests can safely diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4b0e9d7c4c68..b497d4c6dabd 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -9,6 +9,7 @@ #include #include +#include ////////////////////////////////////////////////////////////////////////////// // @@ -719,6 +720,9 @@ static void print_cpu_stall(unsigned long gps) set_preempt_need_resched(); } +static bool csd_lock_suppress_rcu_stall; +module_param(csd_lock_suppress_rcu_stall, bool, 0644); + static void check_cpu_stall(struct rcu_data *rdp) { bool self_detected; @@ -791,7 +795,9 @@ static void check_cpu_stall(struct rcu_data *rdp) return; rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); - if (self_detected) { + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + } else if (self_detected) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); } else { From 27d5749d075578b105e958369a0263a0e15a8d3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 21:19:06 -0700 Subject: [PATCH 50/64] rcu: Extract synchronize_rcu_expedited_stall() from synchronize_rcu_expedited_wait() This commit extracts the RCU CPU stall-warning report code from synchronize_rcu_expedited_wait() and places it in a new function named synchronize_rcu_expedited_stall(). This is strictly a code-movement commit. A later commit will use this reorganization to avoid printing expedited RCU CPU stall warnings while there are ongoing CSD-lock stall reports. Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_exp.h | 111 +++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 51 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4acd29d16fdb..17dd5169012d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -542,6 +542,65 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit) return false; } +/* + * Print out an expedited RCU CPU stall warning message. + */ +static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigned long j) +{ + int cpu; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(); + + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); + ndetected = 0; + rcu_for_each_leaf_node(rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp; + + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(&rcu_data, cpu); + pr_cont(" %d-%c%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], + "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); + } + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); + if (ndetected) { + pr_err("blocking rcu_node structures (internal RCU debug):"); + rcu_for_each_node_breadth_first(rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_exp_done_unlocked(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rnp) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + preempt_disable(); // For smp_processor_id() in dump_cpu_task(). + dump_cpu_task(cpu); + preempt_enable(); + } + rcu_exp_print_detail_task_stall_rnp(rnp); + } +} + /* * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. 
@@ -553,10 +612,8 @@ static void synchronize_rcu_expedited_wait(void) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; - int ndetected; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(); unsigned long flags; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); @@ -593,55 +650,7 @@ static void synchronize_rcu_expedited_wait(void) j = jiffies; rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rcu_state.name); - ndetected = 0; - rcu_for_each_leaf_node(rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) { - struct rcu_data *rdp; - - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(READ_ONCE(rnp->expmask) & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(&rcu_data, cpu); - pr_cont(" %d-%c%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], - "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); - } - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - j - jiffies_start, rcu_state.expedited_sequence, - data_race(rnp_root->expmask), - ".T"[!!data_race(rnp_root->exp_tasks)]); - if (ndetected) { - pr_err("blocking rcu_node structures (internal RCU debug):"); - rcu_for_each_node_breadth_first(rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_exp_done_unlocked(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - data_race(rnp->expmask), - ".T"[!!data_race(rnp->exp_tasks)]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rnp) { - for_each_leaf_node_possible_cpu(rnp, cpu) { - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(READ_ONCE(rnp->expmask) & mask)) - continue; - preempt_disable(); // For smp_processor_id() in dump_cpu_task(). - dump_cpu_task(cpu); - preempt_enable(); - } - rcu_exp_print_detail_task_stall_rnp(rnp); - } + synchronize_rcu_expedited_stall(jiffies_start, j); jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; panic_on_rcu_stall(); } From 7c72dedb0079e62c1c75dbab038332017f34a6b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 21:24:29 -0700 Subject: [PATCH 51/64] rcu: Summarize expedited RCU CPU stall warnings during CSD-lock stalls During CSD-lock stalls, the additional information output by expedited RCU CPU stall warnings is usually redundant, flooding the console for not good reason. However, this has been the way things work for a few years. This commit therefore uses rcutree.csd_lock_suppress_rcu_stall kernel boot parameter that causes expedited RCU CPU stall warnings to be abbreviated to a single line when there is at least one CPU that has been stuck waiting for CSD lock for more than five seconds. Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_exp.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 17dd5169012d..d4be644afb50 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -553,6 +553,10 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(); + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected expedited stalls, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + return; + } pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); ndetected = 0; rcu_for_each_leaf_node(rnp) { From 51b739990cc81316768b85db519eac3999ba92a9 Mon Sep 17 00:00:00 2001 From: Ryo Takakura Date: Fri, 28 Jun 2024 13:18:26 +0900 Subject: [PATCH 52/64] rcu: Let dump_cpu_task() be used without preemption disabled The commit 2d7f00b2f0130 ("rcu: Suppress smp_processor_id() complaint in synchronize_rcu_expedited_wait()") disabled preemption around dump_cpu_task() to suppress warning on its usage within preemtible context. Calling dump_cpu_task() doesn't required to be in non-preemptible context except for suppressing the smp_processor_id() warning. As the smp_processor_id() is evaluated along with in_hardirq() to check if it's in interrupt context, this patch removes the need for its preemtion disablement by reordering the condition so that smp_processor_id() only gets evaluated when it's in interrupt context. Signed-off-by: Ryo Takakura Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_exp.h | 2 -- kernel/sched/core.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d4be644afb50..c5d9a7eb0803 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -597,9 +597,7 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne mask = leaf_node_cpu_bit(rnp, cpu); if (!(READ_ONCE(rnp->expmask) & mask)) continue; - preempt_disable(); // For smp_processor_id() in dump_cpu_task(). dump_cpu_task(cpu); - preempt_enable(); } rcu_exp_print_detail_task_stall_rnp(rnp); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a9f655025607..78ae888a1fa2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9726,7 +9726,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { - if (cpu == smp_processor_id() && in_hardirq()) { + if (in_hardirq() && cpu == smp_processor_id()) { struct pt_regs *regs; regs = get_irq_regs(); From 11377947b5861fa59bf77c827e1dd7c081842cc9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:03 -0700 Subject: [PATCH 53/64] rcuscale: Provide clear error when async specified without primitives Currently, if the rcuscale module's async module parameter is specified for RCU implementations that do not have async primitives such as RCU Tasks Rude (which now lacks a call_rcu_tasks_rude() function), there will be a series of splats due to calls to a NULL pointer. This commit therefore warns of this situation, but switches to non-async testing. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index c507750e94d8..79e1c32d5c0f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -525,7 +525,7 @@ rcu_scale_writer(void *arg) schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); - if (gp_async) { + if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { retry: if (!rhp) rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); @@ -597,7 +597,7 @@ rcu_scale_writer(void *arg) i++; rcu_scale_wait_shutdown(); } while (!torture_must_stop()); - if (gp_async) { + if (gp_async && cur_ops->async) { cur_ops->gp_barrier(); } writer_n_durations[me] = i_max + 1; From abaf1322adbfd4faa28072fd5412e54b9722e477 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:04 -0700 Subject: [PATCH 54/64] rcuscale: Make all writer tasks report upon hang This commit causes all writer tasks to provide a brief report after a hang has been reported, spaced at one-second intervals. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 79e1c32d5c0f..dfe8e0faa4d8 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -483,6 +483,7 @@ rcu_scale_writer(void *arg) unsigned long jdone; long me = (long)arg; struct rcu_head *rhp = NULL; + bool selfreport = false; bool started = false, done = false, alldone = false; u64 t; DEFINE_TORTURE_RANDOM(tr); @@ -593,6 +594,11 @@ rcu_scale_writer(void *arg) cur_ops->stats(); } } + if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) { + pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n", + __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone); + selfreport = true; + } if (started && !alldone && i < MAX_MEAS - 1) i++; rcu_scale_wait_shutdown(); From 3e3c4f0e2753e92aa2a248d34c6866c0a264ac96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:05 -0700 Subject: [PATCH 55/64] rcuscale: Make rcu_scale_writer() tolerate repeated GFP_KERNEL failure Under some conditions, kmalloc(GFP_KERNEL) allocations have been observed to repeatedly fail. This situation has been observed to cause one of the rcu_scale_writer() instances to loop indefinitely retrying memory allocation for an asynchronous grace-period primitive. The problem is that if memory is short, all the other instances will allocate all available memory before the looping task is awakened from its rcu_barrier*() call. This in turn results in hangs, so that rcuscale fails to complete. This commit therefore removes the tight retry loop, so that when this condition occurs, the affected task is still passing through the full loop with its full set of termination checks. This spreads the risk of indefinite memory-allocation retry failures across all instances of rcu_scale_writer() tasks, which in turn prevents the hangs. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index dfe8e0faa4d8..80518662273b 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -520,6 +520,8 @@ rcu_scale_writer(void *arg) jdone = jiffies + minruntime * HZ; do { + bool gp_succeeded = false; + if (writer_holdoff) udelay(writer_holdoff); if (writer_holdoff_jiffies) @@ -527,23 +529,24 @@ rcu_scale_writer(void *arg) wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { -retry: if (!rhp) rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { atomic_inc(this_cpu_ptr(&n_async_inflight)); cur_ops->async(rhp, rcu_scale_async_cb); rhp = NULL; + gp_succeeded = true; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); - goto retry; } else { kfree(rhp); /* Because we are stopping. */ } } else if (gp_exp) { cur_ops->exp_sync(); + gp_succeeded = true; } else { cur_ops->sync(); + gp_succeeded = true; } t = ktime_get_mono_fast_ns(); *wdp = t - *wdp; @@ -599,7 +602,7 @@ rcu_scale_writer(void *arg) __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone); selfreport = true; } - if (started && !alldone && i < MAX_MEAS - 1) + if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1) i++; rcu_scale_wait_shutdown(); } while (!torture_must_stop()); From 1c3e6e7903fa939601fdd326b04507304ba22337 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:06 -0700 Subject: [PATCH 56/64] rcuscale: Use special allocator for rcu_scale_writer() The rcu_scale_writer() function needs only a fixed number of rcu_head structures per kthread, which means that a trivial allocator suffices. This commit therefore uses an llist-based allocator using a fixed array of structures per kthread. This allows aggressive testing of RCU performance without stressing the slab allocators. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 123 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 113 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 80518662273b..bc7cca979c06 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -105,6 +105,19 @@ static char *scale_type = "rcu"; module_param(scale_type, charp, 0444); MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); +// Structure definitions for custom fixed-per-task allocator. +struct writer_mblock { + struct rcu_head wmb_rh; + struct llist_node wmb_node; + struct writer_freelist *wmb_wfl; +}; + +struct writer_freelist { + struct llist_head ws_lhg; + struct llist_head ____cacheline_internodealigned_in_smp ws_lhp; + struct writer_mblock *ws_mblocks; +}; + static int nrealreaders; static int nrealwriters; static struct task_struct **writer_tasks; @@ -113,6 +126,7 @@ static struct task_struct *shutdown_task; static u64 **writer_durations; static bool *writer_done; +static struct writer_freelist *writer_freelists; static int *writer_n_durations; static atomic_t n_rcu_scale_reader_started; static atomic_t n_rcu_scale_writer_started; @@ -463,13 +477,52 @@ rcu_scale_reader(void *arg) return 0; } +/* + * Allocate a writer_mblock structure for the specified rcu_scale_writer + * task. 
+ */ +static struct writer_mblock *rcu_scale_alloc(long me) +{ + struct llist_node *llnp; + struct writer_freelist *wflp; + struct writer_mblock *wmbp; + + if (WARN_ON_ONCE(!writer_freelists)) + return NULL; + wflp = &writer_freelists[me]; + if (llist_empty(&wflp->ws_lhp)) { + // ->ws_lhp is private to its rcu_scale_writer task. + wmbp = container_of(llist_del_all(&wflp->ws_lhg), struct writer_mblock, wmb_node); + wflp->ws_lhp.first = &wmbp->wmb_node; + } + llnp = llist_del_first(&wflp->ws_lhp); + if (!llnp) + return NULL; + return container_of(llnp, struct writer_mblock, wmb_node); +} + +/* + * Free a writer_mblock structure to its rcu_scale_writer task. + */ +static void rcu_scale_free(struct writer_mblock *wmbp) +{ + struct writer_freelist *wflp; + + if (!wmbp) + return; + wflp = wmbp->wmb_wfl; + llist_add(&wmbp->wmb_node, &wflp->ws_lhg); +} + /* * Callback function for asynchronous grace periods from rcu_scale_writer(). */ static void rcu_scale_async_cb(struct rcu_head *rhp) { + struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh); + atomic_dec(this_cpu_ptr(&n_async_inflight)); - kfree(rhp); + rcu_scale_free(wmbp); } /* @@ -482,13 +535,13 @@ rcu_scale_writer(void *arg) int i_max; unsigned long jdone; long me = (long)arg; - struct rcu_head *rhp = NULL; bool selfreport = false; bool started = false, done = false, alldone = false; u64 t; DEFINE_TORTURE_RANDOM(tr); u64 *wdp; u64 *wdpp = writer_durations[me]; + struct writer_mblock *wmbp = NULL; VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); @@ -529,17 +582,18 @@ rcu_scale_writer(void *arg) wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { - if (!rhp) - rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); - if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { + if (!wmbp) + wmbp = rcu_scale_alloc(me); + if (wmbp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_scale_async_cb); - rhp = NULL; + cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb); + wmbp = NULL; gp_succeeded = true; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); } else { - kfree(rhp); /* Because we are stopping. */ + rcu_scale_free(wmbp); /* Because we are stopping. */ + wmbp = NULL; } } else if (gp_exp) { cur_ops->exp_sync(); @@ -607,6 +661,7 @@ rcu_scale_writer(void *arg) rcu_scale_wait_shutdown(); } while (!torture_must_stop()); if (gp_async && cur_ops->async) { + rcu_scale_free(wmbp); cur_ops->gp_barrier(); } writer_n_durations[me] = i_max + 1; @@ -970,12 +1025,30 @@ rcu_scale_cleanup(void) schedule_timeout_uninterruptible(1); } kfree(writer_durations[i]); + if (writer_freelists) { + int ctr = 0; + struct llist_node *llnp; + struct writer_freelist *wflp = &writer_freelists[i]; + + if (wflp->ws_mblocks) { + llist_for_each(llnp, wflp->ws_lhg.first) + ctr++; + llist_for_each(llnp, wflp->ws_lhp.first) + ctr++; + WARN_ONCE(ctr != gp_async_max, + "%s: ctr = %d gp_async_max = %d\n", + __func__, ctr, gp_async_max); + kfree(wflp->ws_mblocks); + } + } } kfree(writer_tasks); kfree(writer_durations); kfree(writer_n_durations); kfree(writer_done); writer_done = NULL; + kfree(writer_freelists); + writer_freelists = NULL; } /* Do torture-type-specific cleanup operations. 
*/ @@ -1002,8 +1075,9 @@ rcu_scale_shutdown(void *arg) static int __init rcu_scale_init(void) { - long i; int firsterr = 0; + long i; + long j; static struct rcu_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS }; @@ -1074,7 +1148,18 @@ rcu_scale_init(void) writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL); writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL); - if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done) { + if (gp_async) { + if (gp_async_max <= 0) { + pr_warn("%s: gp_async_max = %d must be greater than zero.\n", + __func__, gp_async_max); + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + firsterr = -EINVAL; + goto unwind; + } + writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL); + } + if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done || + (gp_async && !writer_freelists)) { SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; @@ -1087,6 +1172,24 @@ rcu_scale_init(void) firsterr = -ENOMEM; goto unwind; } + if (writer_freelists) { + struct writer_freelist *wflp = &writer_freelists[i]; + + init_llist_head(&wflp->ws_lhg); + init_llist_head(&wflp->ws_lhp); + wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]), + GFP_KERNEL); + if (!wflp->ws_mblocks) { + firsterr = -ENOMEM; + goto unwind; + } + for (j = 0; j < gp_async_max; j++) { + struct writer_mblock *wmbp = &wflp->ws_mblocks[j]; + + wmbp->wmb_wfl = wflp; + llist_add(&wmbp->wmb_node, &wflp->ws_lhp); + } + } firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, writer_tasks[i]); if (torture_init_error(firsterr)) From 554f07a119866a78606b36123fb28e9451b1f994 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:07 -0700 Subject: [PATCH 57/64] rcuscale: NULL out top-level pointers to heap memory Currently, if someone modprobes and rmmods rcuscale successfully, but the next run errors out during the modprobe, non-NULL pointers to freed memory will remain. If the run after that also errors out during the modprobe, there will be double-free bugs. This commit therefore NULLs out top-level pointers to memory that has just been freed. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index bc7cca979c06..61a178914256 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -819,6 +819,7 @@ kfree_scale_cleanup(void) torture_stop_kthread(kfree_scale_thread, kfree_reader_tasks[i]); kfree(kfree_reader_tasks); + kfree_reader_tasks = NULL; } torture_cleanup_end(); @@ -987,6 +988,7 @@ rcu_scale_cleanup(void) torture_stop_kthread(rcu_scale_reader, reader_tasks[i]); kfree(reader_tasks); + reader_tasks = NULL; } if (writer_tasks) { @@ -1043,8 +1045,11 @@ rcu_scale_cleanup(void) } } kfree(writer_tasks); + writer_tasks = NULL; kfree(writer_durations); + writer_durations = NULL; kfree(writer_n_durations); + writer_n_durations = NULL; kfree(writer_done); writer_done = NULL; kfree(writer_freelists); From f1fd0e0bb12d09ca0564425b8af76fddf2e380fb Mon Sep 17 00:00:00 2001 From: "Paul E. 
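[ Aside, not part of the patch: the llist-based rcu_scale_writer() allocator introduced above keeps two lock-free lists per writer, a private pop list (ws_lhp) touched only by the owning task and a shared return list (ws_lhg) that the grace-period callbacks push freed blocks onto from any CPU; when the private list runs dry, the owner grabs the whole shared list with one llist_del_all(). A stripped-down sketch of that pattern follows, with hypothetical names, under the same single-consumer assumption that only the owner calls fl_get(). ]

#include <linux/llist.h>
#include <linux/container_of.h>

struct block {
	struct llist_node node;
	struct freelist *fl;
};

struct freelist {
	struct llist_head returned;	/* Shared: freed blocks land here. */
	struct llist_head local;	/* Private to the owning task. */
};

/* Owner-only: pop one block, refilling from the shared list if needed. */
static struct block *fl_get(struct freelist *fl)
{
	struct llist_node *n;

	if (llist_empty(&fl->local))
		fl->local.first = llist_del_all(&fl->returned);
	n = llist_del_first(&fl->local);
	return n ? container_of(n, struct block, node) : NULL;
}

/* Any context: return a block to its owner's shared list. */
static void fl_put(struct block *b)
{
	llist_add(&b->node, &b->fl->returned);
}
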
McKenney" Date: Thu, 1 Aug 2024 17:43:08 -0700 Subject: [PATCH 58/64] rcuscale: Count outstanding callbacks per-task rather than per-CPU The current rcu_scale_writer() asynchronous grace-period testing uses a per-CPU counter to track the number of outstanding callbacks. This is subject to CPU-imbalance errors when tasks migrate from one CPU to another between the time that the counter is incremented and the callback is queued, and additionally in kernels configured such that callbacks can be invoked on some CPU other than the one that queued it. This commit therefore arranges for per-task callback counts, thus avoiding any issues with migration of either tasks or callbacks. Reported-by: Vlastimil Babka Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 61a178914256..6d37596deb1f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -114,6 +114,7 @@ struct writer_mblock { struct writer_freelist { struct llist_head ws_lhg; + atomic_t ws_inflight; struct llist_head ____cacheline_internodealigned_in_smp ws_lhp; struct writer_mblock *ws_mblocks; }; @@ -136,7 +137,6 @@ static u64 t_rcu_scale_writer_started; static u64 t_rcu_scale_writer_finished; static unsigned long b_rcu_gp_test_started; static unsigned long b_rcu_gp_test_finished; -static DEFINE_PER_CPU(atomic_t, n_async_inflight); #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -520,8 +520,9 @@ static void rcu_scale_free(struct writer_mblock *wmbp) static void rcu_scale_async_cb(struct rcu_head *rhp) { struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh); + struct writer_freelist *wflp = wmbp->wmb_wfl; - atomic_dec(this_cpu_ptr(&n_async_inflight)); + atomic_dec(&wflp->ws_inflight); rcu_scale_free(wmbp); } @@ -541,6 +542,7 @@ rcu_scale_writer(void *arg) DEFINE_TORTURE_RANDOM(tr); u64 *wdp; u64 *wdpp = writer_durations[me]; + struct writer_freelist *wflp = &writer_freelists[me]; struct writer_mblock *wmbp = NULL; VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); @@ -584,8 +586,8 @@ rcu_scale_writer(void *arg) if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { if (!wmbp) wmbp = rcu_scale_alloc(me); - if (wmbp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { - atomic_inc(this_cpu_ptr(&n_async_inflight)); + if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) { + atomic_inc(&wflp->ws_inflight); cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb); wmbp = NULL; gp_succeeded = true; From 8f35fefad06323ab8ad5915bb7c6bc1a684cb8b5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 6 Aug 2024 15:30:16 +0200 Subject: [PATCH 59/64] refscale: Constify struct ref_scale_ops 'struct ref_scale_ops' are not modified in these drivers. Constifying this structure moves some data to a read-only section, so increase overall security. On a x86_64, with allmodconfig: Before: ====== text data bss dec hex filename 34231 4167 736 39134 98de kernel/rcu/refscale.o After: ===== text data bss dec hex filename 35175 3239 736 39150 98ee kernel/rcu/refscale.o Signed-off-by: Christophe JAILLET Tested-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/refscale.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index cfec0648e141..0db9db73f57f 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -135,7 +135,7 @@ struct ref_scale_ops { const char *name; }; -static struct ref_scale_ops *cur_ops; +static const struct ref_scale_ops *cur_ops; static void un_delay(const int udl, const int ndl) { @@ -171,7 +171,7 @@ static bool rcu_sync_scale_init(void) return true; } -static struct ref_scale_ops rcu_ops = { +static const struct ref_scale_ops rcu_ops = { .init = rcu_sync_scale_init, .readsection = ref_rcu_read_section, .delaysection = ref_rcu_delay_section, @@ -205,7 +205,7 @@ static void srcu_ref_scale_delay_section(const int nloops, const int udl, const } } -static struct ref_scale_ops srcu_ops = { +static const struct ref_scale_ops srcu_ops = { .init = rcu_sync_scale_init, .readsection = srcu_ref_scale_read_section, .delaysection = srcu_ref_scale_delay_section, @@ -232,7 +232,7 @@ static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, c un_delay(udl, ndl); } -static struct ref_scale_ops rcu_tasks_ops = { +static const struct ref_scale_ops rcu_tasks_ops = { .init = rcu_sync_scale_init, .readsection = rcu_tasks_ref_scale_read_section, .delaysection = rcu_tasks_ref_scale_delay_section, @@ -271,7 +271,7 @@ static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, c } } -static struct ref_scale_ops rcu_trace_ops = { +static const struct ref_scale_ops rcu_trace_ops = { .init = rcu_sync_scale_init, .readsection = rcu_trace_ref_scale_read_section, .delaysection = rcu_trace_ref_scale_delay_section, @@ -310,7 +310,7 @@ static void ref_refcnt_delay_section(const int nloops, const int udl, const int } } -static struct ref_scale_ops refcnt_ops = { +static const struct ref_scale_ops refcnt_ops = { .init = rcu_sync_scale_init, .readsection = ref_refcnt_section, .delaysection = ref_refcnt_delay_section, @@ -347,7 +347,7 @@ static void ref_rwlock_delay_section(const int nloops, const int udl, const int } } -static struct ref_scale_ops rwlock_ops = { +static const struct ref_scale_ops rwlock_ops = { .init = ref_rwlock_init, .readsection = ref_rwlock_section, .delaysection = ref_rwlock_delay_section, @@ -384,7 +384,7 @@ static void ref_rwsem_delay_section(const int nloops, const int udl, const int n } } -static struct ref_scale_ops rwsem_ops = { +static const struct ref_scale_ops rwsem_ops = { .init = ref_rwsem_init, .readsection = ref_rwsem_section, .delaysection = ref_rwsem_delay_section, @@ -419,7 +419,7 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd preempt_enable(); } -static struct ref_scale_ops lock_ops = { +static const struct ref_scale_ops lock_ops = { .readsection = ref_lock_section, .delaysection = ref_lock_delay_section, .name = "lock" @@ -454,7 +454,7 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in preempt_enable(); } -static struct ref_scale_ops lock_irq_ops = { +static const struct ref_scale_ops lock_irq_ops = { .readsection = ref_lock_irq_section, .delaysection = ref_lock_irq_delay_section, .name = "lock-irq" @@ -490,7 +490,7 @@ static void ref_acqrel_delay_section(const int nloops, const int udl, const int preempt_enable(); } -static struct ref_scale_ops acqrel_ops = { +static const struct ref_scale_ops acqrel_ops = { .readsection = ref_acqrel_section, 
.delaysection = ref_acqrel_delay_section, .name = "acqrel" @@ -524,7 +524,7 @@ static void ref_clock_delay_section(const int nloops, const int udl, const int n stopopts = x; } -static struct ref_scale_ops clock_ops = { +static const struct ref_scale_ops clock_ops = { .readsection = ref_clock_section, .delaysection = ref_clock_delay_section, .name = "clock" @@ -556,7 +556,7 @@ static void ref_jiffies_delay_section(const int nloops, const int udl, const int stopopts = x; } -static struct ref_scale_ops jiffies_ops = { +static const struct ref_scale_ops jiffies_ops = { .readsection = ref_jiffies_section, .delaysection = ref_jiffies_delay_section, .name = "jiffies" @@ -706,9 +706,9 @@ static void refscale_typesafe_ctor(void *rtsp_in) preempt_enable(); } -static struct ref_scale_ops typesafe_ref_ops; -static struct ref_scale_ops typesafe_lock_ops; -static struct ref_scale_ops typesafe_seqlock_ops; +static const struct ref_scale_ops typesafe_ref_ops; +static const struct ref_scale_ops typesafe_lock_ops; +static const struct ref_scale_ops typesafe_seqlock_ops; // Initialize for a typesafe test. static bool typesafe_init(void) @@ -769,7 +769,7 @@ static void typesafe_cleanup(void) } // The typesafe_init() function distinguishes these structures by address. -static struct ref_scale_ops typesafe_ref_ops = { +static const struct ref_scale_ops typesafe_ref_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -777,7 +777,7 @@ static struct ref_scale_ops typesafe_ref_ops = { .name = "typesafe_ref" }; -static struct ref_scale_ops typesafe_lock_ops = { +static const struct ref_scale_ops typesafe_lock_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -785,7 +785,7 @@ static struct ref_scale_ops typesafe_lock_ops = { .name = "typesafe_lock" }; -static struct ref_scale_ops typesafe_seqlock_ops = { +static const struct ref_scale_ops typesafe_seqlock_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -1026,7 +1026,7 @@ static int main_func(void *arg) } static void -ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) +ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, @@ -1081,7 +1081,7 @@ ref_scale_init(void) { long i; int firsterr = 0; - static struct ref_scale_ops *scale_ops[] = { + static const struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, From 1fcb932c8b5ce86219d7dedcd63659351a43291c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Jul 2024 00:56:40 +0200 Subject: [PATCH 60/64] rcu/nocb: Simplify (de-)offloading state machine Now that the (de-)offloading process can only apply to offline CPUs, there is no more concurrency between rcu_core and nocb kthreads. Also the mutation now happens on empty queues. Therefore the state machine can be reduced to a single bit called SEGCBLIST_OFFLOADED. Simplify the transition as follows: * Upon offloading: queue the rdp to be added to the rcuog list and wait for the rcuog kthread to set the SEGCBLIST_OFFLOADED bit. Unpark rcuo kthread. * Upon de-offloading: Park rcuo kthread. 
Queue the rdp to be removed from the rcuog list and wait for the rcuog kthread to clear the SEGCBLIST_OFFLOADED bit. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcu_segcblist.h | 4 +- kernel/rcu/rcu_segcblist.c | 11 --- kernel/rcu/rcu_segcblist.h | 2 +- kernel/rcu/tree_nocb.h | 138 ++++++++++++++++------------------ 4 files changed, 68 insertions(+), 87 deletions(-) diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 1ef1bb54853d..2fdc2208f1ca 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -185,9 +185,7 @@ struct rcu_cblist { * ---------------------------------------------------------------------------- */ #define SEGCBLIST_ENABLED BIT(0) -#define SEGCBLIST_LOCKING BIT(1) -#define SEGCBLIST_KTHREAD_GP BIT(2) -#define SEGCBLIST_OFFLOADED BIT(3) +#define SEGCBLIST_OFFLOADED BIT(1) struct rcu_segcblist { struct rcu_head *head; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 1693ea22ef1b..298a2c573f02 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -260,17 +260,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED); } -/* - * Mark the specified rcu_segcblist structure as offloaded (or not) - */ -void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) -{ - if (offload) - rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED); - else - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); -} - /* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 7a0962dfee86..259904075636 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -89,7 +89,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING)) + rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED)) return true; return false; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 24daf606de0c..857024ee0349 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -604,37 +604,33 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, } } -static int nocb_gp_toggle_rdp(struct rcu_data *rdp) +static void nocb_gp_toggle_rdp(struct rcu_data *rdp_gp, struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; - rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * Locking orders future de-offloaded callbacks enqueue against previous + * handling of this rdp. Ie: Make sure rcuog is done with this rdp before + * deoffloaded callbacks can be enqueued. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { /* * Offloading. Set our flag and notify the offload worker. * We will handle this rdp until it ever gets de-offloaded. 
*/ - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - ret = 1; - } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); + rcu_segcblist_set_flags(cblist, SEGCBLIST_OFFLOADED); + } else { /* * De-offloading. Clear our flag and notify the de-offload worker. * We will ignore this rdp until it ever gets re-offloaded. */ - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - ret = 0; - } else { - WARN_ON_ONCE(1); - ret = -1; + list_del(&rdp->nocb_entry_rdp); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_OFFLOADED); } - - rcu_nocb_unlock_irqrestore(rdp, flags); - - return ret; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) @@ -841,14 +837,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (rdp_toggling) { - int ret; - - ret = nocb_gp_toggle_rdp(rdp_toggling); - if (ret == 1) - list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); - else if (ret == 0) - list_del(&rdp_toggling->nocb_entry_rdp); - + nocb_gp_toggle_rdp(my_rdp, rdp_toggling); swake_up_one(&rdp_toggling->nocb_state_wq); } @@ -1018,16 +1007,11 @@ void rcu_nocb_flush_deferred_wakeup(void) } EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); -static int rdp_offload_toggle(struct rcu_data *rdp, - bool offload, unsigned long flags) - __releases(rdp->nocb_lock) +static int rcu_nocb_queue_toggle_rdp(struct rcu_data *rdp) { - struct rcu_segcblist *cblist = &rdp->cblist; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; bool wake_gp = false; - - rcu_segcblist_offload(cblist, offload); - rcu_nocb_unlock_irqrestore(rdp, flags); + unsigned long flags; raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); // Queue this rdp for add/del to/from the list to iterate on rcuog @@ -1041,9 +1025,25 @@ static int rdp_offload_toggle(struct rcu_data *rdp, return wake_gp; } +static bool rcu_nocb_rdp_deoffload_wait_cond(struct rcu_data *rdp) +{ + unsigned long flags; + bool ret; + + /* + * Locking makes sure rcuog is done handling this rdp before deoffloaded + * enqueue can happen. Also it keeps the SEGCBLIST_OFFLOADED flag stable + * while the ->nocb_lock is held. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = !rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) { - struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; int wake_gp; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; @@ -1056,51 +1056,42 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) /* Flush all callbacks from segcblist and bypass */ rcu_barrier(); + /* + * Make sure the rcuoc kthread isn't in the middle of a nocb locked + * sequence while offloading is deactivated, along with nocb locking. 
+ */ + if (rdp->nocb_cb_kthread) + kthread_park(rdp->nocb_cb_kthread); + rcu_nocb_lock_irqsave(rdp, flags); WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_nocb_unlock_irqrestore(rdp, flags); - wake_gp = rdp_offload_toggle(rdp, false, flags); + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); + if (rdp_gp->nocb_gp_kthread) { if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, - SEGCBLIST_KTHREAD_GP)); - if (rdp->nocb_cb_kthread) - kthread_park(rdp->nocb_cb_kthread); + rcu_nocb_rdp_deoffload_wait_cond(rdp)); } else { /* * No kthread to clear the flags for us or remove the rdp from the nocb list * to iterate. Do it here instead. Locking doesn't look stricly necessary * but we stick to paranoia in this rare path. */ - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - rcu_nocb_unlock_irqrestore(rdp, flags); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); list_del(&rdp->nocb_entry_rdp); } - mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - /* - * Lock one last time to acquire latest callback updates from kthreads - * so we can later handle callbacks locally without locking. - */ - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Theoretically we could clear SEGCBLIST_LOCKING after the nocb - * lock is released but how about being paranoid for once? - */ - rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING); - /* - * Without SEGCBLIST_LOCKING, we can't use - * rcu_nocb_unlock_irqrestore() anymore. - */ - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); return 0; } @@ -1129,10 +1120,20 @@ int rcu_nocb_cpu_deoffload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); +static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + static int rcu_nocb_rdp_offload(struct rcu_data *rdp) { - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long flags; int wake_gp; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; @@ -1152,20 +1153,14 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); - /* - * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING - * is set. 
- */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - - wake_gp = rdp_offload_toggle(rdp, true, flags); + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); - kthread_unpark(rdp->nocb_cb_kthread); - swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_nocb_rdp_offload_wait_cond(rdp)); + + kthread_unpark(rdp->nocb_cb_kthread); return 0; } @@ -1340,8 +1335,7 @@ void __init rcu_init_nohz(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); } rcu_organize_nocb_kthreads(); }

From 9139f93209d1ffd7f489ab19dee01b7c3a1a43d2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Aug 2024 00:56:40 +0200 Subject: [PATCH 61/64] rcu/nocb: Fix RT throttling hrtimer armed from offline CPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

After a CPU is marked offline and until it reaches its final trip to idle, rcuo has several opportunities to be woken up, either because a callback has been queued in the meantime or because rcutree_report_cpu_dead() has issued the final deferred NOCB wake up.

If RCU-boosting is enabled, RCU kthreads are set to SCHED_FIFO policy. And if RT-bandwidth is enabled, the related hrtimer might be armed. However, this then happens after hrtimers have been migrated at the CPUHP_AP_HRTIMERS_DYING stage, which is broken as reported by the following warning:

 Call trace:
  enqueue_hrtimer+0x7c/0xf8
  hrtimer_start_range_ns+0x2b8/0x300
  enqueue_task_rt+0x298/0x3f0
  enqueue_task+0x94/0x188
  ttwu_do_activate+0xb4/0x27c
  try_to_wake_up+0x2d8/0x79c
  wake_up_process+0x18/0x28
  __wake_nocb_gp+0x80/0x1a0
  do_nocb_deferred_wakeup_common+0x3c/0xcc
  rcu_report_dead+0x68/0x1ac
  cpuhp_report_idle_dead+0x48/0x9c
  do_idle+0x288/0x294
  cpu_startup_entry+0x34/0x3c
  secondary_start_kernel+0x138/0x158

Fix this by waking up rcuo using an IPI if necessary. Since the existing API to deal with this situation only handles swait queues, rcuo is only woken up from offline CPUs if it's not already waiting on a grace period. In the worst case some callbacks will just wait for a grace period to complete before being assigned to a subsequent one.
Reported-by: "Cheng-Jui Wang (王正睿)" Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Signed-off-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 857024ee0349..e3cf354f7a7f 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -216,7 +216,10 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - wake_up_process(rdp_gp->nocb_gp_kthread); + if (cpu_is_offline(raw_smp_processor_id())) + swake_up_one_online(&rdp_gp->nocb_gp_wq); + else + wake_up_process(rdp_gp->nocb_gp_kthread); } return needwake;

From 1b022b8763fde84c51ca06218306d7d88fb090cb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Aug 2024 00:56:41 +0200 Subject: [PATCH 62/64] rcu/nocb: Conditionally wake up rcuo if not already waiting on GP

A callback enqueuer currently wakes up the rcuo kthread if it is adding the first non-done callback of a CPU, whether the kthread is waiting on a grace period or not (unless the CPU is offline). This looks like desirable behaviour because the rcuo kthread then doesn't wait for the end of the current grace period to handle the callback. It is accelerated right away and assigned to the next grace period. The GP kthread is notified about that fact and iterates with the upcoming GP without sleeping in-between.

However, this best-case scenario is contradicted by a few details, depending on the situation:

1) If the callback is a non-bypass one queued with IRQs enabled, the wake-up only occurs if no other pending callbacks are on the list. Therefore the theoretical "optimization" actually applies only on rare occasions.

2) If the callback is a non-bypass one queued with IRQs disabled, the situation is similar, with even more uncertainty due to the deferred wake-up.

3) If the callback is lazy, a few jiffies don't make any difference.

4) If the callback is a bypass one, the wake-up timer is programmed 2 jiffies ahead by rcuo in case the regular pending queue has been handled in the meantime. A rare storm of callbacks can otherwise wait for the currently elapsing grace period to be flushed and handled.

For all those reasons, the optimization is only theoretical and occasional. Therefore it is reasonable that callback enqueuers only wake up the rcuo kthread when it is not already waiting on a grace period to complete.
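As an illustration (outside this patch), the conditional nature of the new behaviour comes from swait semantics: swake_up_one() affects a task only if it is actually blocked on the given swait queue, whereas wake_up_process() wakes the kthread out of whatever sleep it happens to be in. A minimal sketch, with a made-up queue and function name:

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(example_gp_wq);	/* hypothetical queue */

static void example_nudge(void)
{
	/*
	 * Wakes at most one task blocked in swait_event_*() on
	 * example_gp_wq. A kthread that is busy, or sleeping on
	 * something else, is left alone.
	 */
	swake_up_one(&example_gp_wq);
}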
Signed-off-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index e3cf354f7a7f..14d183438423 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -216,10 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - if (cpu_is_offline(raw_smp_processor_id())) - swake_up_one_online(&rdp_gp->nocb_gp_wq); - else - wake_up_process(rdp_gp->nocb_gp_kthread); + swake_up_one_online(&rdp_gp->nocb_gp_wq); } return needwake;

From 7562eed272b49e233c430524e684b957f34f2fd2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Aug 2024 00:56:42 +0200 Subject: [PATCH 63/64] rcu/nocb: Remove superfluous memory barrier after bypass enqueue

Pre-GP accesses performed by the update side must be ordered against post-GP accesses performed by the readers. This is ensured by the bypass or nocb locking at enqueue time, followed by the fully ordered rnp locking initiated while callbacks are accelerated, and then propagated throughout the whole GP lifecycle associated with the callbacks.

Therefore the explicit barrier advertising ordering between bypass enqueue and rcuo wakeup is superfluous. If anything, it would even only order the first bypass callback enqueue against the rcuo wakeup and ignore all the subsequent ones.

Remove the needless barrier.

Signed-off-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 14d183438423..4ffb8c90d695 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -493,7 +493,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); } rcu_nocb_bypass_unlock(rdp); - smp_mb(); /* Order enqueue before wake. */ + // A wake up of the grace period kthread or timer adjustment // needs to be done only if: // 1. Bypass list was fully empty before (this is the first

From 1ecd9d68eb44e4b7972aee2840eb4fdf29b9de2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Aug 2024 14:15:12 -0700 Subject: [PATCH 64/64] rcu: Defer printing stall-warning backtrace when holding rcu_node lock

The rcu_dump_cpu_stacks() function holds the leaf rcu_node structure's ->lock when dumping the stacks of any CPUs stalling the current grace period. This lock is held to prevent confusion that would otherwise occur when the stalled CPU reported its quiescent state (and then went on to do unrelated things) just as the backtrace NMI was heading towards it.

This has worked well, but on larger systems has recently been observed to cause severe lock contention resulting in CSD-lock stalls and other general unhappiness.

This commit therefore does printk_deferred_enter() before acquiring the lock and printk_deferred_exit() after releasing it, thus deferring the overhead of actually outputting the stack trace out of that lock's critical section.

Reported-by: Rik van Riel Suggested-by: Rik van Riel Signed-off-by: "Paul E.
McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_stall.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b497d4c6dabd..9772e1ffcf6e 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -371,6 +371,7 @@ static void rcu_dump_cpu_stacks(void) struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { + printk_deferred_enter(); raw_spin_lock_irqsave_rcu_node(rnp, flags); for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { @@ -380,6 +381,7 @@ static void rcu_dump_cpu_stacks(void) dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + printk_deferred_exit(); } }