From 8808c5732265c301d3fe7e5043ea8162eef3421a Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 15 Aug 2024 16:50:53 +0800 Subject: [PATCH 01/30] rcu: Remove unused declaration rcu_segcblist_offload() Commit 17351eb59abd ("rcu/nocb: Simplify (de-)offloading state machine") removed the implementation but leave declaration. Signed-off-by: Yue Haibing Reviewed-by: Frederic Weisbecker Reviewed-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcu_segcblist.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 259904075636..fadc08ad4b7b 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -120,7 +120,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); From 26ff1fb02991e1260481185bb5ccab1ee498d5e4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Oct 2024 09:19:29 -0700 Subject: [PATCH 02/30] rcu: Delete unused rcu_gp_might_be_stalled() function The rcu_gp_might_be_stalled() function is no longer used, so this commit removes it. Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Frederic Weisbecker --- include/linux/rcutiny.h | 1 - include/linux/rcutree.h | 1 - kernel/rcu/tree_stall.h | 30 ------------------------------ 3 files changed, 32 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0ee270b3f5ed..fe42315f667f 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -165,7 +165,6 @@ static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_eqs(void) { } static inline void kfree_rcu_scheduler_running(void) { } -static inline bool rcu_gp_might_be_stalled(void) { return false; } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 90a684f94776..27d86d912781 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -40,7 +40,6 @@ void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_eqs(void); void kfree_rcu_scheduler_running(void); -bool rcu_gp_might_be_stalled(void); struct rcu_gp_oldstate { unsigned long rgos_norm; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4432db6d0b99..d7cdd535e50b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -76,36 +76,6 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); -/** - * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? - * - * Returns @true if the current grace period is sufficiently old that - * it is reasonable to assume that it might be stalled. This can be - * useful when deciding whether to allocate memory to enable RCU-mediated - * freeing on the one hand or just invoking synchronize_rcu() on the other. - * The latter is preferable when the grace period is stalled. - * - * Note that sampling of the .gp_start and .gp_seq fields must be done - * carefully to avoid false positives at the beginnings and ends of - * grace periods. - */ -bool rcu_gp_might_be_stalled(void) -{ - unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; - unsigned long j = jiffies; - - if (d < RCU_STALL_MIGHT_MIN) - d = RCU_STALL_MIGHT_MIN; - smp_mb(); // jiffies before .gp_seq to avoid false positives. - if (!rcu_gp_in_progress()) - return false; - // Long delays at this point avoids false positive, but a delay - // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. - smp_mb(); // .gp_seq before second .gp_start - // And ditto here. - return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); -} - /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { From cbe644aa6fe176bdeb7e175bb194ad644d65319f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Oct 2024 09:19:30 -0700 Subject: [PATCH 03/30] rcu: Stop stall warning from dumping stacks if grace period ends Currently, once an RCU CPU stall warning decides to dump the stalling CPUs' stacks, the rcu_dump_cpu_stacks() function persists until it has gone through the full list. Unfortunately, if the stalled grace periods ends midway through, this function will be dumping stacks of innocent-bystander CPUs that happen to be blocking not the old grace period, but instead the new one. This can cause serious confusion. This commit therefore stops dumping stacks if and when the stalled grace period ends. [ paulmck: Apply Joel Fernandes feedback. ] Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree_stall.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index d7cdd535e50b..b530844becf8 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -335,13 +335,17 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) * that don't support NMI-based stack dumps. The NMI-triggered stack * traces are more accurate because they are printed by the target CPU. */ -static void rcu_dump_cpu_stacks(void) +static void rcu_dump_cpu_stacks(unsigned long gp_seq) { int cpu; unsigned long flags; struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { + if (gp_seq != data_race(rcu_state.gp_seq)) { + pr_err("INFO: Stall ended during stack backtracing.\n"); + return; + } printk_deferred_enter(); raw_spin_lock_irqsave_rcu_node(rnp, flags); for_each_leaf_node_possible_cpu(rnp, cpu) @@ -608,7 +612,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, data_race(rcu_state.n_online_cpus)); // Diagnostic read if (ndetected) { - rcu_dump_cpu_stacks(); + rcu_dump_cpu_stacks(gp_seq); /* Complain about tasks blocking the grace period. */ rcu_for_each_leaf_node(rnp) @@ -640,7 +644,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) rcu_force_quiescent_state(); /* Kick them all. */ } -static void print_cpu_stall(unsigned long gps) +static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -677,7 +681,7 @@ static void print_cpu_stall(unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); - rcu_dump_cpu_stacks(); + rcu_dump_cpu_stacks(gp_seq); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Rewrite if needed in case of slow consoles. */ @@ -759,7 +763,8 @@ static void check_cpu_stall(struct rcu_data *rdp) gs2 = READ_ONCE(rcu_state.gp_seq); if (gs1 != gs2 || ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) + ULONG_CMP_GE(gps, js) || + !rcu_seq_state(gs2)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + ULONG_MAX / 2; @@ -780,7 +785,7 @@ static void check_cpu_stall(struct rcu_data *rdp) pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); } else if (self_detected) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(gps); + print_cpu_stall(gs2, gps); } else { /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2, gps); From 79a20a857009f12ab28c716d3ba218cf3ceee6f7 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 22 Aug 2024 11:21:28 +0800 Subject: [PATCH 04/30] srcu: Replace WARN_ON_ONCE() with BUILD_BUG_ON() if possible The value of ARRAY_SIZE() can be determined at compile time, so if both sides of the equation are ARRAY_SIZE(), using BUILD_BUG_ON() can help us catch the problem earlier. While there are cases where unequal array sizes will work, there is no point in allowing them, so it makes more sense to force them to be equal using BUILD_BUG_ON(). Signed-off-by: Zhen Lei Signed-off-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 31706e3293bc..78afaffd1b26 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -128,7 +128,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); @@ -187,7 +187,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Each pass through this loop initializes one srcu_node structure. */ srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); - WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; From 5bc455ff25762b1ac0463a005ccb3baf9f1fe7b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:10:58 -0700 Subject: [PATCH 05/30] srcu: Rename srcu_might_be_idle() to srcu_should_expedite() SRCU auto-expedites grace periods that follow a sufficiently long idle period, and the srcu_might_be_idle() function is used to make this decision. However, the upcoming light-weight SRCU readers will not do auto-expediting because doing so would cause the grace-period machinery to invoke synchronize_rcu_expedited() twice, with IPIs all around. However, software-engineering considerations force this determination to remain in srcu_might_be_idle(). This commit therefore changes the name of srcu_might_be_idle() to srcu_should_expedite(), thus moving from what it currently does to why it does it, this latter being more future-proof. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 78afaffd1b26..2fe0abade9c0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1139,7 +1139,8 @@ static void srcu_flip(struct srcu_struct *ssp) } /* - * If SRCU is likely idle, return true, otherwise return false. + * If SRCU is likely idle, in other words, the next SRCU grace period + * should be expedited, return true, otherwise return false. * * Note that it is OK for several current from-idle requests for a new * grace period from idle to specify expediting because they will all end @@ -1159,7 +1160,7 @@ static void srcu_flip(struct srcu_struct *ssp) * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ -static bool srcu_might_be_idle(struct srcu_struct *ssp) +static bool srcu_should_expedite(struct srcu_struct *ssp) { unsigned long curseq; unsigned long flags; @@ -1469,14 +1470,15 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * Implementation of these memory-ordering guarantees is similar to * that of synchronize_rcu(). * - * If SRCU is likely idle, expedite the first request. This semantic - * was provided by Classic SRCU, and is relied upon by its users, so TREE - * SRCU must also provide it. Note that detecting idleness is heuristic - * and subject to both false positives and negatives. + * If SRCU is likely idle as determined by srcu_should_expedite(), + * expedite the first request. This semantic was provided by Classic SRCU, + * and is relied upon by its users, so TREE SRCU must also provide it. + * Note that detecting idleness is heuristic and subject to both false + * positives and negatives. */ void synchronize_srcu(struct srcu_struct *ssp) { - if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) + if (srcu_should_expedite(ssp) || rcu_gp_is_expedited()) synchronize_srcu_expedited(ssp); else __synchronize_srcu(ssp, true); From e3d6718677628680ae75a42bdb2cdc27d28e8ce8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:10:59 -0700 Subject: [PATCH 06/30] srcu: Introduce srcu_gp_is_expedited() helper function Even though the open-coded expressions usually fit on one line, this commit replaces them with a call to a new srcu_gp_is_expedited() helper function in order to improve readability. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2fe0abade9c0..5b1a315f77bc 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -418,6 +418,16 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } +/* + * Is the current or any upcoming grace period to be expedited? + */ +static bool srcu_gp_is_expedited(struct srcu_struct *ssp) +{ + struct srcu_usage *sup = ssp->srcu_sup; + + return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)); +} + /* * Returns approximate total of the readers' ->srcu_lock_count[] values * for the rank of per-CPU counters specified by idx. @@ -622,7 +632,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; - if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { j = jiffies - 1; @@ -867,7 +877,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(sup); idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) cbdelay = 0; WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); From 9650edd9bf1d152f69ccf96b67c4e28577a4cf98 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Oct 2024 17:22:51 -0700 Subject: [PATCH 07/30] rcu: Finer-grained grace-period-end checks in rcu_dump_cpu_stacks() This commit pushes the grace-period-end checks further down into rcu_dump_cpu_stacks(), and also uses lockless checks coupled with finer-grained locking. The result is that the current leaf rcu_node structure's ->lock is acquired only if a stack backtrace might be needed from the current CPU, and is held across only that CPU's backtrace. As a result, if there are no stalled CPUs associated with a given rcu_node structure, then its ->lock will not be acquired at all. On large systems, it is usually (though not always) the case that a small number of CPUs are stalling the current grace period, which means that the ->lock need be acquired only for a small fraction of the rcu_node structures. [ paulmck: Apply Dan Carpenter feedback. ] Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree_stall.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b530844becf8..925fcdad5dea 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -342,20 +342,24 @@ static void rcu_dump_cpu_stacks(unsigned long gp_seq) struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { - if (gp_seq != data_race(rcu_state.gp_seq)) { - pr_err("INFO: Stall ended during stack backtracing.\n"); - return; - } printk_deferred_enter(); - raw_spin_lock_irqsave_rcu_node(rnp, flags); - for_each_leaf_node_possible_cpu(rnp, cpu) + for_each_leaf_node_possible_cpu(rnp, cpu) { + if (gp_seq != data_race(rcu_state.gp_seq)) { + printk_deferred_exit(); + pr_err("INFO: Stall ended during stack backtracing.\n"); + return; + } + if (!(data_race(rnp->qsmask) & leaf_node_cpu_bit(rnp, cpu))) + continue; + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { if (cpu_is_offline(cpu)) pr_err("Offline CPU %d blocking current GP.\n", cpu); else dump_cpu_task(cpu); } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } printk_deferred_exit(); } } From 365f34483be33a9d0151c06ac39627d7927210d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:00 -0700 Subject: [PATCH 08/30] srcu: Renaming in preparation for additional reader flavor Currently, there are only two flavors of readers, normal and NMI-safe. A number of fields, functions, and types reflect this restriction. This renaming-only commit prepares for the addition of light-weight (as in memory-barrier-free) readers. OK, OK, there is also a drive-by white-space fixeup! Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 21 ++++++++++----------- include/linux/srcutree.h | 2 +- kernel/rcu/srcutree.c | 22 +++++++++++----------- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 835bbb2d1f88..06728ef6f32a 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -181,10 +181,9 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #define SRCU_NMI_SAFE 0x2 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) -void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe); +void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); #else -static inline void srcu_check_nmi_safety(struct srcu_struct *ssp, - bool nmi_safe) { } +static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { } #endif @@ -245,7 +244,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; @@ -262,7 +261,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp { int retval; - srcu_check_nmi_safety(ssp, true); + srcu_check_read_flavor(ssp, true); retval = __srcu_read_lock_nmisafe(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -274,7 +273,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); retval = __srcu_read_lock(ssp); return retval; } @@ -303,7 +302,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(in_nmi()); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); return __srcu_read_lock(ssp); } @@ -318,7 +317,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } @@ -334,7 +333,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_nmi_safety(ssp, true); + srcu_check_read_flavor(ssp, true); rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } @@ -343,7 +342,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); __srcu_read_unlock(ssp, idx); } @@ -360,7 +359,7 @@ static inline void srcu_up_read(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); WARN_ON_ONCE(in_nmi()); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); __srcu_read_unlock(ssp, idx); } diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index ed57598394de..ab7d8d215b84 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -25,7 +25,7 @@ struct srcu_data { /* Read-side state. */ atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ - int srcu_nmi_safety; /* NMI-safe srcu_struct structure? */ + int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ /* Update-side state. */ spinlock_t __private lock ____cacheline_internodealigned_in_smp; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5b1a315f77bc..f259dd834272 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -460,7 +460,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]); if (IS_ENABLED(CONFIG_PROVE_RCU)) - mask = mask | READ_ONCE(cpuc->srcu_nmi_safety); + mask = mask | READ_ONCE(cpuc->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)), "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); @@ -699,25 +699,25 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); #ifdef CONFIG_PROVE_RCU /* - * Check for consistent NMI safety. + * Check for consistent reader flavor. */ -void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe) +void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { - int nmi_safe_mask = 1 << nmi_safe; - int old_nmi_safe_mask; + int reader_flavor_mask = 1 << read_flavor; + int old_reader_flavor_mask; struct srcu_data *sdp; /* NMI-unsafe use in NMI is a bad sign */ - WARN_ON_ONCE(!nmi_safe && in_nmi()); + WARN_ON_ONCE(!read_flavor && in_nmi()); sdp = raw_cpu_ptr(ssp->sda); - old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety); - if (!old_nmi_safe_mask) { - WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask); + old_reader_flavor_mask = READ_ONCE(sdp->srcu_reader_flavor); + if (!old_reader_flavor_mask) { + WRITE_ONCE(sdp->srcu_reader_flavor, reader_flavor_mask); return; } - WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask); + WARN_ONCE(old_reader_flavor_mask != reader_flavor_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_reader_flavor_mask, reader_flavor_mask); } -EXPORT_SYMBOL_GPL(srcu_check_nmi_safety); +EXPORT_SYMBOL_GPL(srcu_check_read_flavor); #endif /* CONFIG_PROVE_RCU */ /* From c2f9467c77941cae5a41aa10c06ff0d5b00f69f9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:01 -0700 Subject: [PATCH 09/30] srcu: Bit manipulation changes for additional reader flavor Currently, there are only two flavors of readers, normal and NMI-safe. Very straightforward state updates suffice to check for erroneous mixing of reader flavors on a given srcu_struct structure. This commit upgrades the checking in preparation for the addition of light-weight (as in memory-barrier-free) readers. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index f259dd834272..9774bc500de5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -462,7 +462,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) if (IS_ENABLED(CONFIG_PROVE_RCU)) mask = mask | READ_ONCE(cpuc->srcu_reader_flavor); } - WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)), + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); return sum; } @@ -712,8 +712,9 @@ void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) sdp = raw_cpu_ptr(ssp->sda); old_reader_flavor_mask = READ_ONCE(sdp->srcu_reader_flavor); if (!old_reader_flavor_mask) { - WRITE_ONCE(sdp->srcu_reader_flavor, reader_flavor_mask); - return; + old_reader_flavor_mask = cmpxchg(&sdp->srcu_reader_flavor, 0, reader_flavor_mask); + if (!old_reader_flavor_mask) + return; } WARN_ONCE(old_reader_flavor_mask != reader_flavor_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_reader_flavor_mask, reader_flavor_mask); } From 9a87bda2b6881de10fb5791cade3719663b8d660 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:02 -0700 Subject: [PATCH 10/30] srcu: Standardize srcu_data pointers to "sdp" and similar This commit changes a few "cpuc" variables to "sdp" to align with usage elsewhere. [ paulmck: Apply Neeraj Upadhyay feedback. ] Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9774bc500de5..b85da944d794 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -438,9 +438,9 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_lock_count[idx]); } return sum; } @@ -456,11 +456,11 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); if (IS_ENABLED(CONFIG_PROVE_RCU)) - mask = mask | READ_ONCE(cpuc->srcu_reader_flavor); + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); @@ -564,12 +564,12 @@ static bool srcu_readers_active(struct srcu_struct *ssp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[0]); - sum += atomic_long_read(&cpuc->srcu_lock_count[1]); - sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]); - sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_lock_count[0]); + sum += atomic_long_read(&sdp->srcu_lock_count[1]); + sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); + sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); } return sum; } From c071b8e5351453c2cb2d12f425d928f3a24ed2d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:03 -0700 Subject: [PATCH 11/30] srcu: Improve srcu_read_lock{,_nmisafe}() comments This commit adds some additional usage constraints to the kernel-doc headers of srcu_read_lock() and srcu_read_lock_nmi_safe(). Suggested-by: Andrii Nakryiko Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 06728ef6f32a..46fd06b212ba 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -235,10 +235,13 @@ static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flav * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * - * Note that srcu_read_lock() and the matching srcu_read_unlock() must - * occur in the same context, for example, it is illegal to invoke - * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() - * was invoked in process context. + * The return value from srcu_read_lock() must be passed unaltered + * to the matching srcu_read_unlock(). Note that srcu_read_lock() and + * the matching srcu_read_unlock() must occur in the same context, for + * example, it is illegal to invoke srcu_read_unlock() in an irq handler + * if the matching srcu_read_lock() was invoked in process context. Or, + * for that matter to invoke srcu_read_unlock() from one task and the + * matching srcu_read_lock() from another. */ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { @@ -256,6 +259,10 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) * * Enter an SRCU read-side critical section, but in an NMI-safe manner. * See srcu_read_lock() for more information. + * + * If srcu_read_lock_nmisafe() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. */ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp) { From 05829be27fe6f64e0675dc3be3a12d43b52492e1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:04 -0700 Subject: [PATCH 12/30] srcu: Create CPP macros for normal and NMI-safe SRCU readers This commit creates SRCU_READ_FLAVOR_NORMAL and SRCU_READ_FLAVOR_NMI C-preprocessor macros for srcu_read_lock() and srcu_read_lock_nmisafe(), respectively. These replace the old true/false values that were previously passed to srcu_check_read_flavor(). In addition, the srcu_check_read_flavor() function itself requires a bit of rework to handle bitmasks instead of true/false values. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 20 ++++++++------------ include/linux/srcutree.h | 4 ++++ kernel/rcu/srcutree.c | 21 +++++++++++---------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 46fd06b212ba..84daaa33ea0a 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -176,10 +176,6 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -#define SRCU_NMI_UNKNOWN 0x0 -#define SRCU_NMI_UNSAFE 0x1 -#define SRCU_NMI_SAFE 0x2 - #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); #else @@ -247,7 +243,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; @@ -268,7 +264,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp { int retval; - srcu_check_read_flavor(ssp, true); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI); retval = __srcu_read_lock_nmisafe(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -280,7 +276,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); return retval; } @@ -309,7 +305,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(in_nmi()); - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); return __srcu_read_lock(ssp); } @@ -324,7 +320,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } @@ -340,7 +336,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_read_flavor(ssp, true); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI); rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } @@ -349,7 +345,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); __srcu_read_unlock(ssp, idx); } @@ -366,7 +362,7 @@ static inline void srcu_up_read(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); WARN_ON_ONCE(in_nmi()); - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); __srcu_read_unlock(ssp, idx); } diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index ab7d8d215b84..79ad809c7f03 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -43,6 +43,10 @@ struct srcu_data { struct srcu_struct *ssp; }; +/* Values for ->srcu_reader_flavor. */ +#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). +#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). + /* * Node in SRCU combining tree, similar in function to rcu_data. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b85da944d794..602b4b8c4b89 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -463,7 +463,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), - "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); return sum; } @@ -703,20 +703,21 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); */ void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { - int reader_flavor_mask = 1 << read_flavor; - int old_reader_flavor_mask; + int old_read_flavor; struct srcu_data *sdp; - /* NMI-unsafe use in NMI is a bad sign */ - WARN_ON_ONCE(!read_flavor && in_nmi()); + /* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. */ + WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi()); + WARN_ON_ONCE(read_flavor & (read_flavor - 1)); + sdp = raw_cpu_ptr(ssp->sda); - old_reader_flavor_mask = READ_ONCE(sdp->srcu_reader_flavor); - if (!old_reader_flavor_mask) { - old_reader_flavor_mask = cmpxchg(&sdp->srcu_reader_flavor, 0, reader_flavor_mask); - if (!old_reader_flavor_mask) + old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor); + if (!old_read_flavor) { + old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor); + if (!old_read_flavor) return; } - WARN_ONCE(old_reader_flavor_mask != reader_flavor_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_reader_flavor_mask, reader_flavor_mask); + WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor); } EXPORT_SYMBOL_GPL(srcu_check_read_flavor); #endif /* CONFIG_PROVE_RCU */ From 6364dd8191d27230176ac4b1b4daaecaf4807399 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:05 -0700 Subject: [PATCH 13/30] srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite() This patch adds srcu_read_lock_lite() and srcu_read_unlock_lite(), which dispense with the read-side smp_mb() but also are restricted to code regions that RCU is watching. If a given srcu_struct structure uses srcu_read_lock_lite() and srcu_read_unlock_lite(), it is not permitted to use any other SRCU read-side marker, before, during, or after. Another price of light-weight readers is heavier weight grace periods. Such readers mean that SRCU grace periods on srcu_struct structures used by light-weight readers will incur at least two calls to synchronize_rcu(). In addition, normal SRCU grace periods for light-weight-reader srcu_struct structures never auto-expedite. Note that expedited SRCU grace periods for light-weight-reader srcu_struct structures still invoke synchronize_rcu(), not synchronize_srcu_expedited(). Something about wishing to keep the IPIs down to a dull roar. The srcu_read_lock_lite() and srcu_read_unlock_lite() functions may not (repeat, *not*) be used from NMI handlers, but if this is needed, an additional flavor of SRCU reader can be added by some future commit. [ paulmck: Apply Alexei Starovoitov expediting feedback. ] [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 51 ++++++++++++++++++++++++- include/linux/srcutree.h | 1 + kernel/rcu/srcutree.c | 82 ++++++++++++++++++++++++++++++++++------ 3 files changed, 122 insertions(+), 12 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 84daaa33ea0a..4ba96e2cfa40 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -56,6 +56,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); +#ifdef CONFIG_TINY_SRCU +#define __srcu_read_lock_lite __srcu_read_lock +#define __srcu_read_unlock_lite __srcu_read_unlock +#else // #ifdef CONFIG_TINY_SRCU +int __srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp); +void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) __releases(ssp); +#endif // #else // #ifdef CONFIG_TINY_SRCU void synchronize_srcu(struct srcu_struct *ssp); #define SRCU_GET_STATE_COMPLETED 0x1 @@ -179,7 +186,7 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); #else -static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { } +#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) #endif @@ -249,6 +256,32 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) return retval; } +/** + * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but for a light-weight + * smp_mb()-free reader. See srcu_read_lock() for more information. + * + * If srcu_read_lock_lite() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. Note that grace-period auto-expediting is disabled for _lite + * srcu_struct structures because auto-expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. + * + * Note that srcu_read_lock_lite() can be invoked only from those contexts + * where RCU is watching. Otherwise, lockdep will complain. + */ +static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) +{ + int retval; + + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + retval = __srcu_read_lock_lite(ssp); + rcu_try_lock_acquire(&ssp->dep_map); + return retval; +} + /** * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. @@ -325,6 +358,22 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __srcu_read_unlock(ssp, idx); } +/** + * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock(). + * + * Exit a light-weight SRCU read-side critical section. + */ +static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) + __releases(ssp) +{ + WARN_ON_ONCE(idx & ~0x1); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + srcu_lock_release(&ssp->dep_map); + __srcu_read_unlock(ssp, idx); +} + /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 79ad809c7f03..8074138cbd62 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -46,6 +46,7 @@ struct srcu_data { /* Values for ->srcu_reader_flavor. */ #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). +#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). /* * Node in SRCU combining tree, similar in function to rcu_data. diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 602b4b8c4b89..8632a3caeb33 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -429,20 +429,29 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp) } /* - * Returns approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx. + * Computes approximate total of the readers' ->srcu_lock_count[] values + * for the rank of per-CPU counters specified by idx, and returns true if + * the caller did the proper barrier (gp), and if the count of the locks + * matches that of the unlocks passed in. */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) +static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { int cpu; + unsigned long mask = 0; unsigned long sum = 0; for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); sum += atomic_long_read(&sdp->srcu_lock_count[idx]); + if (IS_ENABLED(CONFIG_PROVE_RCU)) + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } - return sum; + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + if (mask & SRCU_READ_FLAVOR_LITE && !gp) + return false; + return sum == unlocks; } /* @@ -473,6 +482,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) */ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) { + bool did_gp = !!(raw_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE); unsigned long unlocks; unlocks = srcu_readers_unlock_idx(ssp, idx); @@ -482,13 +492,16 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * unlock is counted. Needs to be a smp_mb() as the read side may * contain a read from a variable that is written to before the * synchronize_srcu() in the write side. In this case smp_mb()s - * A and B act like the store buffering pattern. + * A and B (or X and Y) act like the store buffering pattern. * - * This smp_mb() also pairs with smp_mb() C to prevent accesses - * after the synchronize_srcu() from being executed before the - * grace period ends. + * This smp_mb() also pairs with smp_mb() C (or, in the case of X, + * Z) to prevent accesses after the synchronize_srcu() from being + * executed before the grace period ends. */ - smp_mb(); /* A */ + if (!did_gp) + smp_mb(); /* A */ + else + synchronize_rcu(); /* X */ /* * If the locks are the same as the unlocks, then there must have @@ -546,7 +559,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * which are unlikely to be configured with an address space fully * populated with memory, at least not anytime soon. */ - return srcu_readers_lock_idx(ssp, idx) == unlocks; + return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks); } /** @@ -750,6 +763,47 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) } EXPORT_SYMBOL_GPL(__srcu_read_unlock); +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Returns an index that must be passed to the matching + * srcu_read_unlock_lite(). + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +int __srcu_read_lock_lite(struct srcu_struct *ssp) +{ + int idx; + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); + idx = READ_ONCE(ssp->srcu_idx) & 0x1; + this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ + barrier(); /* Avoid leaking the critical section. */ + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock_lite); + +/* + * Removes the count for the old reader from the appropriate + * per-CPU element of the srcu_struct. Note that this may well be a + * different CPU than that which was incremented by the corresponding + * srcu_read_lock_lite(), but it must be within the same task. + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) +{ + barrier(); /* Avoid leaking the critical section. */ + this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock_lite); + #ifdef CONFIG_NEED_SRCU_NMI_SAFE /* @@ -1134,6 +1188,8 @@ static void srcu_flip(struct srcu_struct *ssp) * it stays until either (1) Compilers learn about this sort of * control dependency or (2) Some production workload running on * a production system is unduly delayed by this slowpath smp_mb(). + * Except for _lite() readers, where it is inoperative, which + * means that it is a good thing that it is redundant. */ smp_mb(); /* E */ /* Pairs with B and C. */ @@ -1152,7 +1208,8 @@ static void srcu_flip(struct srcu_struct *ssp) /* * If SRCU is likely idle, in other words, the next SRCU grace period - * should be expedited, return true, otherwise return false. + * should be expedited, return true, otherwise return false. Except that + * in the presence of _lite() readers, always return false. * * Note that it is OK for several current from-idle requests for a new * grace period from idle to specify expediting because they will all end @@ -1181,6 +1238,9 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) unsigned long tlast; check_init_srcu_struct(ssp); + /* If _lite() readers, don't do unsolicited expediting. */ + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_rcu_node(sdp, flags); From bb94b12e4503bdce003a74e95ee4214eba923f86 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:06 -0700 Subject: [PATCH 14/30] srcu: Allow inlining of __srcu_read_{,un}lock_lite() This commit moves __srcu_read_lock_lite() and __srcu_read_unlock_lite() into include/linux/srcu.h and marks them "static inline" so that they can be inlined into srcu_read_lock_lite() and srcu_read_unlock_lite(), respectively. They are not hand-inlined due to Tree SRCU and Tiny SRCU having different implementations. The earlier removal of smp_mb() combined with the inlining produce significant single-percentage performance wins. Link: https://lore.kernel.org/all/CAEf4BzYgiNmSb=ZKQ65tm6nJDi1UX2Gq26cdHSH1mPwXJYZj5g@mail.gmail.com/ Reported-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcutree.h | 39 ++++++++++++++++++++++++++++++++++++++ kernel/rcu/srcutree.c | 41 ---------------------------------------- 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8074138cbd62..778eb61542e1 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -209,4 +209,43 @@ void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Returns an index that must be passed to the matching + * srcu_read_unlock_lite(). + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) +{ + int idx; + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); + idx = READ_ONCE(ssp->srcu_idx) & 0x1; + this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ + barrier(); /* Avoid leaking the critical section. */ + return idx; +} + +/* + * Removes the count for the old reader from the appropriate + * per-CPU element of the srcu_struct. Note that this may well be a + * different CPU than that which was incremented by the corresponding + * srcu_read_lock_lite(), but it must be within the same task. + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) +{ + barrier(); /* Avoid leaking the critical section. */ + this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); +} + #endif diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 8632a3caeb33..d3a0c76ce590 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -763,47 +763,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) } EXPORT_SYMBOL_GPL(__srcu_read_unlock); -/* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Returns an index that must be passed to the matching - * srcu_read_unlock_lite(). - * - * Note that this_cpu_inc() is an RCU read-side critical section either - * because it disables interrupts, because it is a single instruction, - * or because it is a read-modify-write atomic operation, depending on - * the whims of the architecture. - */ -int __srcu_read_lock_lite(struct srcu_struct *ssp) -{ - int idx; - - RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ - barrier(); /* Avoid leaking the critical section. */ - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock_lite); - -/* - * Removes the count for the old reader from the appropriate - * per-CPU element of the srcu_struct. Note that this may well be a - * different CPU than that which was incremented by the corresponding - * srcu_read_lock_lite(), but it must be within the same task. - * - * Note that this_cpu_inc() is an RCU read-side critical section either - * because it disables interrupts, because it is a single instruction, - * or because it is a read-modify-write atomic operation, depending on - * the whims of the architecture. - */ -void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) -{ - barrier(); /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); -} -EXPORT_SYMBOL_GPL(__srcu_read_unlock_lite); - #ifdef CONFIG_NEED_SRCU_NMI_SAFE /* From 37a1decb43f381d5b8f5d4a64608d916949dd9ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:07 -0700 Subject: [PATCH 15/30] rcutorture: Expand RCUTORTURE_RDR_MASK_[12] to eight bits This commit prepares for testing of multiple SRCU reader flavors by expanding RCUTORTURE_RDR_MASK_1 and RCUTORTURE_RDR_MASK_2 from a single bit to eight bits, allowing them to accommodate the return values from multiple calls to srcu_read_lock*(). This will in turn permit better testing coverage for these SRCU reader flavors, including testing of the diagnostics for inproper use of mixed reader flavors. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcutorture.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bb75dbf5c800..f96ab98f8182 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -57,9 +57,9 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) -#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ @@ -71,6 +71,9 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett > RCUTORTURE_RDR_SHIFT_2) > 1); + WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -1835,9 +1838,9 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); if (statesnew & RCUTORTURE_RDR_RCU_1) - idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + idxnew1 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; if (statesnew & RCUTORTURE_RDR_RCU_2) - idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2; + idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; /* * Next, remove old protection, in decreasing order of strength @@ -1857,7 +1860,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_RCU_2) { - cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + cur_ops->readunlock((idxold2 & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2); WARN_ON_ONCE(idxnew2 != -1); idxold2 = 0; } @@ -1867,7 +1870,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); - cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + cur_ops->readunlock((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); WARN_ON_ONCE(idxnew1 != -1); idxold1 = 0; if (lockit) @@ -1882,16 +1885,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (idxnew1 == -1) idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; WARN_ON_ONCE(idxnew1 < 0); - if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) - pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); if (idxnew2 == -1) idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; WARN_ON_ONCE(idxnew2 < 0); - WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1); *readstate = idxnew1 | idxnew2 | newstate; WARN_ON_ONCE(*readstate < 0); - if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) - pr_info("Unexpected idxnew2 value of %#x\n", idxnew2); + if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) + pr_info("Unexpected readstate value of %#x\n", *readstate); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1916,7 +1916,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); // Can't have reader idx bits. /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; From 95a5de21541d9eb0cf4983f9ffe8b7140db66ef3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:08 -0700 Subject: [PATCH 16/30] rcutorture: Add reader_flavor parameter for SRCU readers This commit adds an rcutorture.reader_flavor parameter whose bits correspond to reader flavors. For example, SRCU's readers are 0x1 for normal and 0x2 for NMI-safe. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- .../admin-guide/kernel-parameters.txt | 8 +++++ kernel/rcu/rcutorture.c | 30 ++++++++++++++----- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1518343bbe22..52922727006f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5426,6 +5426,14 @@ The delay, in seconds, between successive read-then-exit testing episodes. + rcutorture.reader_flavor= [KNL] + A bit mask indicating which readers to use. + If there is more than one bit set, the readers + are entered from low-order bit up, and are + exited in the opposite order. For SRCU, the + 0x1 bit is normal readers and the 0x2 bit is + for NMI-safe readers. + rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks allows some CPUs to go into dyntick-idle mode diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f96ab98f8182..405decec3367 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -111,6 +111,7 @@ torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disab torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); +torture_param(int, reader_flavor, 0x1, "Reader flavors to use, one per bit."); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -644,10 +645,20 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { - if (cur_ops == &srcud_ops) - return srcu_read_lock_nmisafe(srcu_ctlp); - else - return srcu_read_lock(srcu_ctlp); + int idx; + int ret = 0; + + if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) { + idx = srcu_read_lock(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx; + } + if (reader_flavor & 0x2) { + idx = srcu_read_lock_nmisafe(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 1; + } + return ret; } static void @@ -671,10 +682,11 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { - if (cur_ops == &srcud_ops) - srcu_read_unlock_nmisafe(srcu_ctlp, idx); - else - srcu_read_unlock(srcu_ctlp, idx); + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & 0x2) + srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); + if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) + srcu_read_unlock(srcu_ctlp, idx & 0x1); } static int torture_srcu_read_lock_held(void) @@ -2389,6 +2401,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " + "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -2401,6 +2414,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, + reader_flavor, nocbs_nthreads, nocbs_toggle, test_nmis); } From 43349fc4d8098d8679e7b142841a9661c623ed3a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:09 -0700 Subject: [PATCH 17/30] rcutorture: Add srcu_read_lock_lite() support to rcutorture.reader_flavor This commit causes bit 0x4 of rcutorture.reader_flavor to select the new srcu_read_lock_lite() and srcu_read_unlock_lite() functions. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- kernel/rcu/rcutorture.c | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 52922727006f..203ec51e41d4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5431,8 +5431,8 @@ If there is more than one bit set, the readers are entered from low-order bit up, and are exited in the opposite order. For SRCU, the - 0x1 bit is normal readers and the 0x2 bit is - for NMI-safe readers. + 0x1 bit is normal readers, 0x2 NMI-safe readers, + and 0x4 light-weight readers. rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 405decec3367..a313cdcb0960 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -658,6 +658,11 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 1; } + if (reader_flavor & 0x4) { + idx = srcu_read_lock_lite(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 2; + } return ret; } @@ -683,6 +688,8 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & 0x4) + srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & 0x2) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) From 930d4e1344f16e20c9d9ffc3f8888ac78dfd5659 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:10 -0700 Subject: [PATCH 18/30] rcutorture: Add light-weight SRCU scenario This commit adds an rcutorture scenario that tests light-weight SRCU readers. While in the area, it adjusts the size of the TREE10 scenario. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/rcutorture/configs/rcu/CFLIST | 1 + tools/testing/selftests/rcutorture/configs/rcu/SRCU-L | 10 ++++++++++ .../selftests/rcutorture/configs/rcu/SRCU-L.boot | 3 +++ .../selftests/rcutorture/configs/rcu/SRCU-N.boot | 1 + tools/testing/selftests/rcutorture/configs/rcu/TREE10 | 2 +- 5 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-L create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST index 98b6175e5aa0..45f572570a8c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST @@ -5,6 +5,7 @@ TREE04 TREE05 TREE07 TREE09 +SRCU-L SRCU-N SRCU-P SRCU-T diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L new file mode 100644 index 000000000000..3b4fa8dbef8a --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L @@ -0,0 +1,10 @@ +CONFIG_RCU_TRACE=n +CONFIG_SMP=y +CONFIG_NR_CPUS=6 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_RCU_EXPERT=n +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot new file mode 100644 index 000000000000..0207b3138c5b --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot @@ -0,0 +1,3 @@ +rcutorture.torture_type=srcu +rcutorture.reader_flavor=0x4 +rcutorture.fwd_progress=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot index ce0694fd9b92..b54cf87dc110 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot @@ -1,2 +1,3 @@ rcutorture.torture_type=srcu +rcutorture.reader_flavor=0x2 rcutorture.fwd_progress=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 index a323d8948b7c..759ee51d3ddc 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -1,5 +1,5 @@ CONFIG_SMP=y -CONFIG_NR_CPUS=56 +CONFIG_NR_CPUS=74 CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n From 6a2c0255e8a0fea7439bf395eb290f5734e3d345 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Oct 2024 17:25:12 -0700 Subject: [PATCH 19/30] refscale: Add srcu_read_lock_lite() support using "srcu-lite" This commit creates a new srcu-lite option for the refscale.scale_type module parameter that selects srcu_read_lock_lite() and srcu_read_unlock_lite(). [ paulmck: Apply Dan Carpenter feedback. ] Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/refscale.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 0db9db73f57f..338e7c5ac44a 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -212,6 +212,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_lite_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock_lite(srcu_ctlp); + srcu_read_unlock_lite(srcu_ctlp, idx); + } +} + +static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock_lite(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_lite(srcu_ctlp, idx); + } +} + +static const struct ref_scale_ops srcu_lite_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_lite_ref_scale_read_section, + .delaysection = srcu_lite_ref_scale_delay_section, + .name = "srcu-lite" +}; + #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. @@ -1082,9 +1112,10 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, - &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, + &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, + &typesafe_seqlock_ops, }; if (!torture_init_begin(scale_type, verbose)) From 768b1f87098a4a586353898b074989808b1b27ad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:12 -0700 Subject: [PATCH 20/30] srcu: Improve srcu_read_lock_lite() kernel-doc comment Where RCU is watching is where it is OK to invoke rcu_read_lock(). Reported-by: Andrii Nakryiko Signed-off-by: Paul E. McKenney Acked-by: Andrii Nakryiko Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4ba96e2cfa40..bab1dae3f69e 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -270,7 +270,8 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) * synchronize_rcu_expedited(), IPIs and all. * * Note that srcu_read_lock_lite() can be invoked only from those contexts - * where RCU is watching. Otherwise, lockdep will complain. + * where RCU is watching, that is, from contexts where it would be legal + * to invoke rcu_read_lock(). Otherwise, lockdep will complain. */ static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) { From 2996980e20b7a54a1869df15b3445374b850b155 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 22 Oct 2024 11:41:17 +0800 Subject: [PATCH 21/30] rcu/nocb: Fix missed RCU barrier on deoffloading Currently, running rcutorture test with torture_type=rcu fwd_progress=8 n_barrier_cbs=8 nocbs_nthreads=8 nocbs_toggle=100 onoff_interval=60 test_boost=2, will trigger the following warning: WARNING: CPU: 19 PID: 100 at kernel/rcu/tree_nocb.h:1061 rcu_nocb_rdp_deoffload+0x292/0x2a0 RIP: 0010:rcu_nocb_rdp_deoffload+0x292/0x2a0 Call Trace: ? __warn+0x7e/0x120 ? rcu_nocb_rdp_deoffload+0x292/0x2a0 ? report_bug+0x18e/0x1a0 ? handle_bug+0x3d/0x70 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? rcu_nocb_rdp_deoffload+0x292/0x2a0 rcu_nocb_cpu_deoffload+0x70/0xa0 rcu_nocb_toggle+0x136/0x1c0 ? __pfx_rcu_nocb_toggle+0x10/0x10 kthread+0xd1/0x100 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 CPU0 CPU2 CPU3 //rcu_nocb_toggle //nocb_cb_wait //rcutorture // deoffload CPU1 // process CPU1's rdp rcu_barrier() rcu_segcblist_entrain() rcu_segcblist_add_len(1); // len == 2 // enqueue barrier // callback to CPU1's // rdp->cblist rcu_do_batch() // invoke CPU1's rdp->cblist // callback rcu_barrier_callback() rcu_barrier() mutex_lock(&rcu_state.barrier_mutex); // still see len == 2 // enqueue barrier callback // to CPU1's rdp->cblist rcu_segcblist_entrain() rcu_segcblist_add_len(1); // len == 3 // decrement len rcu_segcblist_add_len(-2); kthread_parkme() // CPU1's rdp->cblist len == 1 // Warn because there is // still a pending barrier // trigger warning WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); cpus_read_unlock(); // wait CPU1 to comes online and // invoke barrier callback on // CPU1 rdp's->cblist wait_for_completion(&rcu_state.barrier_completion); // deoffload CPU4 cpus_read_lock() rcu_barrier() mutex_lock(&rcu_state.barrier_mutex); // block on barrier_mutex // wait rcu_barrier() on // CPU3 to unlock barrier_mutex // but CPU3 unlock barrier_mutex // need to wait CPU1 comes online // when CPU1 going online will block on cpus_write_lock The above scenario will not only trigger a WARN_ON_ONCE(), but also trigger a deadlock. Thanks to nocb locking, a second racing rcu_barrier() on an offline CPU will either observe the decremented callback counter down to 0 and spare the callback enqueue, or rcuo will observe the new callback and keep rdp->nocb_cb_sleep to false. Therefore check rdp->nocb_cb_sleep before parking to make sure no further rcu_barrier() is waiting on the rdp. Fixes: 1fcb932c8b5c ("rcu/nocb: Simplify (de-)offloading state machine") Suggested-by: Frederic Weisbecker Signed-off-by: Zqiang Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree_nocb.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 16865475120b..2605dd234a13 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -891,7 +891,18 @@ static void nocb_cb_wait(struct rcu_data *rdp) swait_event_interruptible_exclusive(rdp->nocb_cb_wq, nocb_cb_wait_cond(rdp)); if (kthread_should_park()) { - kthread_parkme(); + /* + * kthread_park() must be preceded by an rcu_barrier(). + * But yet another rcu_barrier() might have sneaked in between + * the barrier callback execution and the callbacks counter + * decrement. + */ + if (rdp->nocb_cb_sleep) { + rcu_nocb_lock_irqsave(rdp, flags); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_nocb_unlock_irqrestore(rdp, flags); + kthread_parkme(); + } } else if (READ_ONCE(rdp->nocb_cb_sleep)) { WARN_ON(signal_pending(current)); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); From 32693634cdf90e56bd167e5226db7ca569707437 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Oct 2024 11:02:50 -0700 Subject: [PATCH 22/30] torture: Add --no-affinity parameter to kvm.sh In performance tests, it can be counter-productive to spread torture-test guest OSes across sockets. Plus the experimenter might have ideas about what CPUs individual guest OSes are to run on. This commit therefore adds a --no-affinity parameter to kvm.sh to prevent it from running taskset on its guest OSes. Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- .../rcutorture/bin/kvm-test-1-run-batch.sh | 43 ++++++++++--------- tools/testing/selftests/rcutorture/bin/kvm.sh | 6 +++ 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh index c3808c490d92..f87046b702d8 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh @@ -56,27 +56,30 @@ do echo > $i/kvm-test-1-run-qemu.sh.out export TORTURE_AFFINITY= kvm-get-cpus-script.sh $T/cpuarray.awk $T/cpubatches.awk $T/cpustate - cat << ' ___EOF___' >> $T/cpubatches.awk - END { - affinitylist = ""; - if (!gotcpus()) { - print "echo No CPU-affinity information, so no taskset command."; - } else if (cpu_count !~ /^[0-9][0-9]*$/) { - print "echo " scenario ": Bogus number of CPUs (old qemu-cmd?), so no taskset command."; - } else { - affinitylist = nextcpus(cpu_count); - if (!(affinitylist ~ /^[0-9,-][0-9,-]*$/)) - print "echo " scenario ": Bogus CPU-affinity information, so no taskset command."; - else if (!dumpcpustate()) - print "echo " scenario ": Could not dump state, so no taskset command."; - else - print "export TORTURE_AFFINITY=" affinitylist; + if test -z "${TORTURE_NO_AFFINITY}" + then + cat << ' ___EOF___' >> $T/cpubatches.awk + END { + affinitylist = ""; + if (!gotcpus()) { + print "echo No CPU-affinity information, so no taskset command."; + } else if (cpu_count !~ /^[0-9][0-9]*$/) { + print "echo " scenario ": Bogus number of CPUs (old qemu-cmd?), so no taskset command."; + } else { + affinitylist = nextcpus(cpu_count); + if (!(affinitylist ~ /^[0-9,-][0-9,-]*$/)) + print "echo " scenario ": Bogus CPU-affinity information, so no taskset command."; + else if (!dumpcpustate()) + print "echo " scenario ": Could not dump state, so no taskset command."; + else + print "export TORTURE_AFFINITY=" affinitylist; + } } - } - ___EOF___ - cpu_count="`grep '# TORTURE_CPU_COUNT=' $i/qemu-cmd | sed -e 's/^.*=//'`" - affinity_export="`awk -f $T/cpubatches.awk -v cpu_count="$cpu_count" -v scenario=$i < /dev/null`" - $affinity_export + ___EOF___ + cpu_count="`grep '# TORTURE_CPU_COUNT=' $i/qemu-cmd | sed -e 's/^.*=//'`" + affinity_export="`awk -f $T/cpubatches.awk -v cpu_count="$cpu_count" -v scenario=$i < /dev/null`" + $affinity_export + fi kvm-test-1-run-qemu.sh $i >> $i/kvm-test-1-run-qemu.sh.out 2>&1 & done for i in $runfiles diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 7af73ddc148d..42e5e8597a1a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -42,6 +42,7 @@ TORTURE_JITTER_STOP="" TORTURE_KCONFIG_KASAN_ARG="" TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" +TORTURE_NO_AFFINITY="" TORTURE_QEMU_MEM=512 torture_qemu_mem_default=1 TORTURE_REMOTE= @@ -82,6 +83,7 @@ usage () { echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --memory megabytes|nnnG" + echo " --no-affinity" echo " --no-initrd" echo " --qemu-args qemu-arguments" echo " --qemu-cmd qemu-system-..." @@ -220,6 +222,9 @@ do torture_qemu_mem_default= shift ;; + --no-affinity) + TORTURE_NO_AFFINITY="no-affinity" + ;; --no-initrd) TORTURE_INITRD=""; export TORTURE_INITRD ;; @@ -417,6 +422,7 @@ TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_K TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG TORTURE_MOD="$TORTURE_MOD"; export TORTURE_MOD +TORTURE_NO_AFFINITY="$TORTURE_NO_AFFINITY"; export TORTURE_NO_AFFINITY TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC From 046c06f5ba97b31da189396e922ebff3f502518e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Oct 2024 11:02:51 -0700 Subject: [PATCH 23/30] refscale: Correct affinity check The current affinity check works fine until there are more reader processes than CPUs, at which point the affinity check is looking for non-existent CPUs. This commit therefore applies the same modulus to the check as is present in the set_cpus_allowed_ptr() call. Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/refscale.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 0db9db73f57f..25910ebe95c0 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -829,7 +829,7 @@ ref_scale_reader(void *arg) goto end; // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(raw_smp_processor_id() != me); + WARN_ON_ONCE(raw_smp_processor_id() != me % nr_cpu_ids); WRITE_ONCE(rt->start_reader, 0); if (!atomic_dec_return(&n_started)) From ff9ba8db87222be8d344f7c1cc3c28b1e22f6429 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Oct 2024 11:02:52 -0700 Subject: [PATCH 24/30] rcuscale: Add guest_os_delay module parameter This commit adds a guest_os_delay module parameter that extends warm-up and cool-down the specified number of seconds before and after the series of test runs. This allows the data-collection intervals from any given rcuscale guest OSes to line up with active periods in the other rcuscale guest OSes, and also allows the thermal warm-up period required to obtain consistent results from one test to the next. Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/refscale.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 25910ebe95c0..c8374760e003 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -75,6 +75,9 @@ MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s"); +// Number of seconds to extend warm-up and cool-down for multiple guest OSes +torture_param(long, guest_os_delay, 0, + "Number of seconds to extend warm-up/cool-down for multiple guest OSes."); // Wait until there are multiple CPUs before starting test. torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, "Holdoff time before test start (s)"); @@ -801,6 +804,18 @@ static void rcu_scale_one_reader(void) cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); } +// Warm up cache, or, if needed run a series of rcu_scale_one_reader() +// to allow multiple rcuscale guest OSes to collect mutually valid data. +static void rcu_scale_warm_cool(void) +{ + unsigned long jdone = jiffies + (guest_os_delay > 0 ? guest_os_delay * HZ : -1); + + do { + rcu_scale_one_reader(); + cond_resched(); + } while (time_before(jiffies, jdone)); +} + // Reader kthread. Repeatedly does empty RCU read-side // critical section, minimizing update-side interference. static int @@ -957,6 +972,7 @@ static int main_func(void *arg) schedule_timeout_uninterruptible(1); // Start exp readers up per experiment + rcu_scale_warm_cool(); for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { if (torture_must_stop()) goto end; @@ -987,6 +1003,7 @@ static int main_func(void *arg) result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); } + rcu_scale_warm_cool(); // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); From 80e935c8c154d8fbdd85a20d89b4962662ceddd7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Oct 2024 11:02:53 -0700 Subject: [PATCH 25/30] rcutorture: Avoid printing cpu=-1 for no-fault RCU boost failure If a CPU runs throughout the stalled grace period without passing through a quiescent state, RCU priority boosting cannot help. The rcu_torture_boost_failed() function therefore prints a message flagging the first such CPU. However, if the stall was instead due to (for example) RCU's grace-period kthread being starved of CPU, there will be no such CPU, causing rcu_check_boost_fail() to instead pass back -1 through its cpup CPU-pointer parameter. Therefore, the current message complains about a mythical CPU -1. This commit therefore checks for this situation, and notes that all CPUs have passed through a quiescent state. Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcutorture.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bb75dbf5c800..e92fa97fc76f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1059,8 +1059,13 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *star // At most one persisted message per boost test. j = jiffies; lp = READ_ONCE(last_persist); - if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp) - pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + if (time_after(j, lp + mininterval) && + cmpxchg(&last_persist, lp, j) == lp) { + if (cpu < 0) + pr_info("Boost inversion persisted: QS from all CPUs\n"); + else + pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + } return false; // passed on a technicality } VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); From 174dd22a781b97cb355b1037f0931436a254d3be Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Nov 2024 09:38:26 -0800 Subject: [PATCH 26/30] srcu: Remove smp_mb() from srcu_read_unlock_lite() The srcu_read_unlock_lite() function invokes __srcu_read_unlock() instead of __srcu_read_unlock_lite(), which means that it is doing an unnecessary smp_mb(). This is harmless other than the performance degradation. This commit therefore switches to __srcu_read_unlock_lite(). Reported-by: Neeraj Upadhyay Closes: https://lore.kernel.org/all/d07e8f4a-d5ff-4c8e-8e61-50db285c57e9@amd.com/ Fixes: c0f08d6b5a61 ("srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()") Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index bab1dae3f69e..56f83237de4d 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -372,7 +372,7 @@ static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) WARN_ON_ONCE(idx & ~0x1); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); srcu_lock_release(&ssp->dep_map); - __srcu_read_unlock(ssp, idx); + __srcu_read_unlock_lite(ssp, idx); } /** From f8ce622ac9d89260595e26d4e0da8cb6b4a8e030 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Nov 2024 13:57:03 -0800 Subject: [PATCH 27/30] srcu: Check for srcu_read_lock_lite() across all CPUs If srcu_read_lock_lite() is used on a given srcu_struct structure, then the grace-period processing must do synchronize_rcu() instead of smp_mb() between the scans of the ->srcu_unlock_count[] and ->srcu_lock_count[] counters. Currently, it does that by testing the SRCU_READ_FLAVOR_LITE bit of the ->srcu_reader_flavor mask, which works well. But only if the CPU running that srcu_struct structure's grace period has previously executed srcu_read_lock_lite(), which might not be the case, especially just after that srcu_struct structure has been created and initialized. This commit therefore updates the srcu_readers_unlock_idx() function to OR together the ->srcu_reader_flavor masks from all CPUs, and then make the srcu_readers_active_idx_check() function that test the SRCU_READ_FLAVOR_LITE bit in the resulting mask. Note that the srcu_readers_unlock_idx() function is already scanning all the CPUs to sum up the ->srcu_unlock_count[] fields and that this is on the grace-period slow path, hence no concerns about the small amount of extra work. Reported-by: Neeraj Upadhyay Closes: https://lore.kernel.org/all/d07e8f4a-d5ff-4c8e-8e61-50db285c57e9@amd.com/ Fixes: c0f08d6b5a61 ("srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()") Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d3a0c76ce590..a381b553cdca 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -458,7 +458,7 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns * Returns approximate total of the readers' ->srcu_unlock_count[] values * for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { int cpu; unsigned long mask = 0; @@ -468,11 +468,11 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); - if (IS_ENABLED(CONFIG_PROVE_RCU)) - mask = mask | READ_ONCE(sdp->srcu_reader_flavor); + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + *rdm = mask; return sum; } @@ -482,10 +482,12 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) */ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) { - bool did_gp = !!(raw_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE); + bool did_gp; + unsigned long rdm; unsigned long unlocks; - unlocks = srcu_readers_unlock_idx(ssp, idx); + unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); + did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); /* * Make sure that a lock is always counted if the corresponding From 9407f5c3ec10c155aae61fc4496a7c17a96907b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Nov 2024 16:28:51 -0800 Subject: [PATCH 28/30] srcu: Unconditionally record srcu_read_lock_lite() in ->srcu_reader_flavor Currently, srcu_read_lock_lite() uses the SRCU_READ_FLAVOR_LITE bit in ->srcu_reader_flavor to communicate to the grace-period processing in srcu_readers_active_idx_check() that the smp_mb() must be replaced by a synchronize_rcu(). Unfortunately, ->srcu_reader_flavor is not updated unless the kernel is built with CONFIG_PROVE_RCU=y. Therefore in all kernels built with CONFIG_PROVE_RCU=n, srcu_readers_active_idx_check() incorrectly uses smp_mb() instead of synchronize_rcu() for srcu_struct structures whose readers use srcu_read_lock_lite(). This commit therefore causes Tree SRCU srcu_read_lock_lite() to unconditionally update ->srcu_reader_flavor so that srcu_readers_active_idx_check() can make the correct choice. Reported-by: Neeraj Upadhyay Closes: https://lore.kernel.org/all/d07e8f4a-d5ff-4c8e-8e61-50db285c57e9@amd.com/ Fixes: c0f08d6b5a61 ("srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()") Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 8 +------- include/linux/srcutiny.h | 3 +++ include/linux/srcutree.h | 21 +++++++++++++++++++++ kernel/rcu/srcutree.c | 6 ++---- 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 56f83237de4d..08339eb8a01c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -183,12 +183,6 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) -void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -#else -#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) -#endif - /** * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing @@ -277,7 +271,7 @@ static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + srcu_check_read_flavor_lite(ssp); retval = __srcu_read_lock_lite(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 4d96bbdb45f0..1321da803274 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -81,6 +81,9 @@ static inline void srcu_barrier(struct srcu_struct *ssp) synchronize_srcu(ssp); } +#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) +#define srcu_check_read_flavor_lite(ssp) do { } while (0) + /* Defined here to avoid size increase for non-torture kernels. */ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 778eb61542e1..490aeecc6bb4 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -248,4 +248,25 @@ static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } +void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); + +// Record _lite() usage even for CONFIG_PROVE_RCU=n kernels. +static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp) +{ + struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + + if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)) + return; + + // Note that the cmpxchg() in srcu_check_read_flavor() is fully ordered. + __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); +} + +// Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. +static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) +{ + if (IS_ENABLED(CONFIG_PROVE_RCU)) + __srcu_check_read_flavor(ssp, read_flavor); +} + #endif diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a381b553cdca..5e2e53464794 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -712,11 +712,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -#ifdef CONFIG_PROVE_RCU /* * Check for consistent reader flavor. */ -void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) +void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { int old_read_flavor; struct srcu_data *sdp; @@ -734,8 +733,7 @@ void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) } WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor); } -EXPORT_SYMBOL_GPL(srcu_check_read_flavor); -#endif /* CONFIG_PROVE_RCU */ +EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); /* * Counts the new reader in the appropriate per-CPU element of the From 812a1c3b9f7c36d9255f0d29d0a3d324e2f52321 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 13 Nov 2024 12:00:08 +0100 Subject: [PATCH 29/30] rcuscale: Do a proper cleanup if kfree_scale_init() fails A static analyzer for C, Smatch, reports and triggers below warnings: kernel/rcu/rcuscale.c:1215 rcu_scale_init() warn: inconsistent returns 'global &fullstop_mutex'. The checker complains about, we do not unlock the "fullstop_mutex" mutex, in case of hitting below error path: ... if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); WARN_ON_ONCE(1); return -1; ^^^^^^^^^^ ... it happens because "-1" is returned right away instead of doing a proper unwinding. Fix it by jumping to "unwind" label instead of returning -1. Reported-by: Dan Carpenter Reviewed-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay Closes: https://lore.kernel.org/rcu/ZxfTrHuEGtgnOYWp@pc636/T/ Fixes: 084e04fff160 ("rcuscale: Add laziness and kfree tests") Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcuscale.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 6d37596deb1f..d360fa44b234 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -890,13 +890,15 @@ kfree_scale_init(void) if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); WARN_ON_ONCE(1); - return -1; + firsterr = -1; + goto unwind; } if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) { pr_alert("ERROR: call_rcu() CBs are being too lazy!\n"); WARN_ON_ONCE(1); - return -1; + firsterr = -1; + goto unwind; } } From c229d579d047f9c4fb4d6c37d9d04b88a398e461 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 13 Nov 2024 12:00:09 +0100 Subject: [PATCH 30/30] rcuscale: Remove redundant WARN_ON_ONCE() splat There are two places where WARN_ON_ONCE() is called two times in the error paths. One which is encapsulated into if() condition and another one, which is unnecessary, is placed in the brackets. Remove an extra WARN_ON_ONCE() splat which is in brackets. Reviewed-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcuscale.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index d360fa44b234..0f3059b1b80d 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -889,14 +889,12 @@ kfree_scale_init(void) if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); - WARN_ON_ONCE(1); firsterr = -1; goto unwind; } if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) { pr_alert("ERROR: call_rcu() CBs are being too lazy!\n"); - WARN_ON_ONCE(1); firsterr = -1; goto unwind; }