diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index ca7b7cd806a1..30080ff6f406 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -249,7 +249,7 @@ ticks this GP)" indicates that this CPU has not taken any scheduling-clock interrupts during the current stalled grace period. The "idle=" portion of the message prints the dyntick-idle state. -The hex number before the first "/" is the low-order 12 bits of the +The hex number before the first "/" is the low-order 16 bits of the dynticks counter, which will have an even-numbered value if the CPU is in dyntick-idle mode and an odd-numbered value otherwise. The hex number between the two "/"s is the value of the nesting, which will be diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f2875484795a..7e1315ee0507 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5415,11 +5415,6 @@ Set time (jiffies) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. - rcutorture.read_exit= [KNL] - Set the number of read-then-exit kthreads used - to test the interaction of RCU updaters and - task-exit processing. - rcutorture.read_exit_burst= [KNL] The number of times in a given read-then-exit episode that a set of read-then-exit kthreads @@ -5429,6 +5424,14 @@ The delay, in seconds, between successive read-then-exit testing episodes. + rcutorture.reader_flavor= [KNL] + A bit mask indicating which readers to use. + If there is more than one bit set, the readers + are entered from low-order bit up, and are + exited in the opposite order. For SRCU, the + 0x1 bit is normal readers, 0x2 NMI-safe readers, + and 0x4 light-weight readers. + rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks allows some CPUs to go into dyntick-idle mode diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst index b6aeae3327ce..ea7fa2a8bbf0 100644 --- a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst +++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst @@ -315,7 +315,7 @@ To reduce its OS jitter, do at least one of the following: to do. Name: - rcuop/%d and rcuos/%d + rcuop/%d, rcuos/%d, and rcuog/%d Purpose: Offload RCU callbacks from the corresponding CPU. diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0ee270b3f5ed..fe42315f667f 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -165,7 +165,6 @@ static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_eqs(void) { } static inline void kfree_rcu_scheduler_running(void) { } -static inline bool rcu_gp_might_be_stalled(void) { return false; } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 90a684f94776..27d86d912781 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -40,7 +40,6 @@ void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_eqs(void); void kfree_rcu_scheduler_running(void); -bool rcu_gp_might_be_stalled(void); struct rcu_gp_oldstate { unsigned long rgos_norm; diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 835bbb2d1f88..08339eb8a01c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -56,6 +56,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); +#ifdef CONFIG_TINY_SRCU +#define __srcu_read_lock_lite __srcu_read_lock +#define __srcu_read_unlock_lite __srcu_read_unlock +#else // #ifdef CONFIG_TINY_SRCU +int __srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp); +void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) __releases(ssp); +#endif // #else // #ifdef CONFIG_TINY_SRCU void synchronize_srcu(struct srcu_struct *ssp); #define SRCU_GET_STATE_COMPLETED 0x1 @@ -176,17 +183,6 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -#define SRCU_NMI_UNKNOWN 0x0 -#define SRCU_NMI_UNSAFE 0x1 -#define SRCU_NMI_SAFE 0x2 - -#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) -void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe); -#else -static inline void srcu_check_nmi_safety(struct srcu_struct *ssp, - bool nmi_safe) { } -#endif - /** * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing @@ -236,33 +232,67 @@ static inline void srcu_check_nmi_safety(struct srcu_struct *ssp, * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * - * Note that srcu_read_lock() and the matching srcu_read_unlock() must - * occur in the same context, for example, it is illegal to invoke - * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() - * was invoked in process context. + * The return value from srcu_read_lock() must be passed unaltered + * to the matching srcu_read_unlock(). Note that srcu_read_lock() and + * the matching srcu_read_unlock() must occur in the same context, for + * example, it is illegal to invoke srcu_read_unlock() in an irq handler + * if the matching srcu_read_lock() was invoked in process context. Or, + * for that matter to invoke srcu_read_unlock() from one task and the + * matching srcu_read_lock() from another. */ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; } +/** + * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but for a light-weight + * smp_mb()-free reader. See srcu_read_lock() for more information. + * + * If srcu_read_lock_lite() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. Note that grace-period auto-expediting is disabled for _lite + * srcu_struct structures because auto-expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. + * + * Note that srcu_read_lock_lite() can be invoked only from those contexts + * where RCU is watching, that is, from contexts where it would be legal + * to invoke rcu_read_lock(). Otherwise, lockdep will complain. + */ +static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) +{ + int retval; + + srcu_check_read_flavor_lite(ssp); + retval = __srcu_read_lock_lite(ssp); + rcu_try_lock_acquire(&ssp->dep_map); + return retval; +} + /** * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section, but in an NMI-safe manner. * See srcu_read_lock() for more information. + * + * If srcu_read_lock_nmisafe() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. */ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_nmi_safety(ssp, true); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI); retval = __srcu_read_lock_nmisafe(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -274,7 +304,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); return retval; } @@ -303,7 +333,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(in_nmi()); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); return __srcu_read_lock(ssp); } @@ -318,11 +348,27 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } +/** + * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock(). + * + * Exit a light-weight SRCU read-side critical section. + */ +static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) + __releases(ssp) +{ + WARN_ON_ONCE(idx & ~0x1); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + srcu_lock_release(&ssp->dep_map); + __srcu_read_unlock_lite(ssp, idx); +} + /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. @@ -334,7 +380,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_nmi_safety(ssp, true); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI); rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } @@ -343,7 +389,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); __srcu_read_unlock(ssp, idx); } @@ -360,7 +406,7 @@ static inline void srcu_up_read(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); WARN_ON_ONCE(in_nmi()); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); __srcu_read_unlock(ssp, idx); } diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 4d96bbdb45f0..1321da803274 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -81,6 +81,9 @@ static inline void srcu_barrier(struct srcu_struct *ssp) synchronize_srcu(ssp); } +#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) +#define srcu_check_read_flavor_lite(ssp) do { } while (0) + /* Defined here to avoid size increase for non-torture kernels. */ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index ed57598394de..490aeecc6bb4 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -25,7 +25,7 @@ struct srcu_data { /* Read-side state. */ atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ - int srcu_nmi_safety; /* NMI-safe srcu_struct structure? */ + int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ /* Update-side state. */ spinlock_t __private lock ____cacheline_internodealigned_in_smp; @@ -43,6 +43,11 @@ struct srcu_data { struct srcu_struct *ssp; }; +/* Values for ->srcu_reader_flavor. */ +#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). +#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). +#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). + /* * Node in SRCU combining tree, similar in function to rcu_data. */ @@ -204,4 +209,64 @@ void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Returns an index that must be passed to the matching + * srcu_read_unlock_lite(). + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) +{ + int idx; + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); + idx = READ_ONCE(ssp->srcu_idx) & 0x1; + this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ + barrier(); /* Avoid leaking the critical section. */ + return idx; +} + +/* + * Removes the count for the old reader from the appropriate + * per-CPU element of the srcu_struct. Note that this may well be a + * different CPU than that which was incremented by the corresponding + * srcu_read_lock_lite(), but it must be within the same task. + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) +{ + barrier(); /* Avoid leaking the critical section. */ + this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); +} + +void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); + +// Record _lite() usage even for CONFIG_PROVE_RCU=n kernels. +static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp) +{ + struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + + if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)) + return; + + // Note that the cmpxchg() in srcu_check_read_flavor() is fully ordered. + __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); +} + +// Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. +static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) +{ + if (IS_ENABLED(CONFIG_PROVE_RCU)) + __srcu_check_read_flavor(ssp, read_flavor); +} + #endif diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 3e079de0f5b4..b9b6bc55185d 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -249,16 +249,24 @@ config RCU_NOCB_CPU workloads will incur significant increases in context-switch rates. - This option offloads callback invocation from the set of CPUs - specified at boot time by the rcu_nocbs parameter. For each - such CPU, a kthread ("rcuox/N") will be created to invoke - callbacks, where the "N" is the CPU being offloaded, and where - the "x" is "p" for RCU-preempt (PREEMPTION kernels) and "s" for - RCU-sched (!PREEMPTION kernels). Nothing prevents this kthread - from running on the specified CPUs, but (1) the kthreads may be - preempted between each callback, and (2) affinity or cgroups can - be used to force the kthreads to run on whatever set of CPUs is - desired. + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "p" for RCU-preempt (PREEMPTION kernels) + and "s" for RCU-sched (!PREEMPTION kernels). This option + also creates another kthread for each sqrt(nr_cpu_ids) CPUs + ("rcuog/N", where N is the first CPU in that group to come + online), which handles grace periods for its group. Nothing + prevents these kthreads from running on the specified CPUs, + but (1) the kthreads may be preempted between each callback, + and (2) affinity or cgroups can be used to force the kthreads + to run on whatever set of CPUs is desired. + + The sqrt(nr_cpu_ids) grouping may be overridden using the + rcutree.rcu_nocb_gp_stride kernel boot parameter. This can + be especially helpful for smaller numbers of CPUs, where + sqrt(nr_cpu_ids) can be a bit of a blunt instrument. Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 259904075636..fadc08ad4b7b 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -120,7 +120,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 6d37596deb1f..0f3059b1b80d 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -889,14 +889,14 @@ kfree_scale_init(void) if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); - WARN_ON_ONCE(1); - return -1; + firsterr = -1; + goto unwind; } if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) { pr_alert("ERROR: call_rcu() CBs are being too lazy!\n"); - WARN_ON_ONCE(1); - return -1; + firsterr = -1; + goto unwind; } } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bb75dbf5c800..2ae8a5e5e99a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -57,9 +57,9 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) -#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ @@ -71,6 +71,9 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett > 2); + if (reader_flavor & 0x2) + srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); + if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) + srcu_read_unlock(srcu_ctlp, idx & 0x1); } static int torture_srcu_read_lock_held(void) @@ -1059,8 +1083,13 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *star // At most one persisted message per boost test. j = jiffies; lp = READ_ONCE(last_persist); - if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp) - pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + if (time_after(j, lp + mininterval) && + cmpxchg(&last_persist, lp, j) == lp) { + if (cpu < 0) + pr_info("Boost inversion persisted: QS from all CPUs\n"); + else + pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + } return false; // passed on a technicality } VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); @@ -1695,14 +1724,22 @@ rcu_torture_fakewriter(void *arg) cur_ops->cond_sync_exp_full(&gp_snap_full); break; case RTWS_POLL_GET: + if (cur_ops->start_poll_irqsoff) + local_irq_disable(); gp_snap = cur_ops->start_gp_poll(); + if (cur_ops->start_poll_irqsoff) + local_irq_enable(); while (!cur_ops->poll_gp_state(gp_snap)) { torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } break; case RTWS_POLL_GET_FULL: + if (cur_ops->start_poll_irqsoff) + local_irq_disable(); cur_ops->start_gp_poll_full(&gp_snap_full); + if (cur_ops->start_poll_irqsoff) + local_irq_enable(); while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); @@ -1820,7 +1857,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, int statesold = *readstate & ~newstate; WARN_ON_ONCE(idxold2 < 0); - WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1); + WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -1835,9 +1872,9 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); if (statesnew & RCUTORTURE_RDR_RCU_1) - idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + idxnew1 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; if (statesnew & RCUTORTURE_RDR_RCU_2) - idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2; + idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; /* * Next, remove old protection, in decreasing order of strength @@ -1857,7 +1894,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_RCU_2) { - cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + cur_ops->readunlock((idxold2 & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2); WARN_ON_ONCE(idxnew2 != -1); idxold2 = 0; } @@ -1867,7 +1904,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); - cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + cur_ops->readunlock((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); WARN_ON_ONCE(idxnew1 != -1); idxold1 = 0; if (lockit) @@ -1882,16 +1919,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (idxnew1 == -1) idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; WARN_ON_ONCE(idxnew1 < 0); - if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) - pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); if (idxnew2 == -1) idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; WARN_ON_ONCE(idxnew2 < 0); - WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1); *readstate = idxnew1 | idxnew2 | newstate; WARN_ON_ONCE(*readstate < 0); - if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) - pr_info("Unexpected idxnew2 value of %#x\n", idxnew2); + if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) + pr_info("Unexpected readstate value of %#x\n", *readstate); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1916,7 +1950,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); // Can't have reader idx bits. /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; @@ -2389,6 +2423,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " + "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -2401,6 +2436,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, + reader_flavor, nocbs_nthreads, nocbs_toggle, test_nmis); } diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 0db9db73f57f..aacfcc9838b3 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -75,6 +75,9 @@ MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s"); +// Number of seconds to extend warm-up and cool-down for multiple guest OSes +torture_param(long, guest_os_delay, 0, + "Number of seconds to extend warm-up/cool-down for multiple guest OSes."); // Wait until there are multiple CPUs before starting test. torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, "Holdoff time before test start (s)"); @@ -212,6 +215,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_lite_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock_lite(srcu_ctlp); + srcu_read_unlock_lite(srcu_ctlp, idx); + } +} + +static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock_lite(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_lite(srcu_ctlp, idx); + } +} + +static const struct ref_scale_ops srcu_lite_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_lite_ref_scale_read_section, + .delaysection = srcu_lite_ref_scale_delay_section, + .name = "srcu-lite" +}; + #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. @@ -801,6 +834,18 @@ static void rcu_scale_one_reader(void) cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); } +// Warm up cache, or, if needed run a series of rcu_scale_one_reader() +// to allow multiple rcuscale guest OSes to collect mutually valid data. +static void rcu_scale_warm_cool(void) +{ + unsigned long jdone = jiffies + (guest_os_delay > 0 ? guest_os_delay * HZ : -1); + + do { + rcu_scale_one_reader(); + cond_resched(); + } while (time_before(jiffies, jdone)); +} + // Reader kthread. Repeatedly does empty RCU read-side // critical section, minimizing update-side interference. static int @@ -829,7 +874,7 @@ repeat: goto end; // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(raw_smp_processor_id() != me); + WARN_ON_ONCE(raw_smp_processor_id() != me % nr_cpu_ids); WRITE_ONCE(rt->start_reader, 0); if (!atomic_dec_return(&n_started)) @@ -957,6 +1002,7 @@ static int main_func(void *arg) schedule_timeout_uninterruptible(1); // Start exp readers up per experiment + rcu_scale_warm_cool(); for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { if (torture_must_stop()) goto end; @@ -987,6 +1033,7 @@ static int main_func(void *arg) result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); } + rcu_scale_warm_cool(); // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); @@ -1082,9 +1129,10 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, - &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, + &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, + &typesafe_seqlock_ops, }; if (!torture_init_begin(scale_type, verbose)) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 549c03336ee9..4dcbf8aa80ff 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -122,8 +122,8 @@ void srcu_drive_gp(struct work_struct *wp) ssp = container_of(wp, struct srcu_struct, srcu_work); preempt_disable(); // Needed for PREEMPT_AUTO if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { - return; /* Already running or nothing to do. */ preempt_enable(); + return; /* Already running or nothing to do. */ } /* Remove recently arrived callbacks and wait for readers. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 31706e3293bc..5e2e53464794 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -128,7 +128,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); @@ -187,7 +187,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Each pass through this loop initializes one srcu_node structure. */ srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); - WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; @@ -419,41 +419,60 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) } /* - * Returns approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx. + * Is the current or any upcoming grace period to be expedited? */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) +static bool srcu_gp_is_expedited(struct srcu_struct *ssp) { - int cpu; - unsigned long sum = 0; + struct srcu_usage *sup = ssp->srcu_sup; - for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); - - sum += atomic_long_read(&cpuc->srcu_lock_count[idx]); - } - return sum; + return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)); } /* - * Returns approximate total of the readers' ->srcu_unlock_count[] values - * for the rank of per-CPU counters specified by idx. + * Computes approximate total of the readers' ->srcu_lock_count[] values + * for the rank of per-CPU counters specified by idx, and returns true if + * the caller did the proper barrier (gp), and if the count of the locks + * matches that of the unlocks passed in. */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) +static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { int cpu; unsigned long mask = 0; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_lock_count[idx]); if (IS_ENABLED(CONFIG_PROVE_RCU)) - mask = mask | READ_ONCE(cpuc->srcu_nmi_safety); + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } - WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)), - "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + if (mask & SRCU_READ_FLAVOR_LITE && !gp) + return false; + return sum == unlocks; +} + +/* + * Returns approximate total of the readers' ->srcu_unlock_count[] values + * for the rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) +{ + int cpu; + unsigned long mask = 0; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); + + sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); + } + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + *rdm = mask; return sum; } @@ -463,22 +482,28 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) */ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) { + bool did_gp; + unsigned long rdm; unsigned long unlocks; - unlocks = srcu_readers_unlock_idx(ssp, idx); + unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); + did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); /* * Make sure that a lock is always counted if the corresponding * unlock is counted. Needs to be a smp_mb() as the read side may * contain a read from a variable that is written to before the * synchronize_srcu() in the write side. In this case smp_mb()s - * A and B act like the store buffering pattern. + * A and B (or X and Y) act like the store buffering pattern. * - * This smp_mb() also pairs with smp_mb() C to prevent accesses - * after the synchronize_srcu() from being executed before the - * grace period ends. + * This smp_mb() also pairs with smp_mb() C (or, in the case of X, + * Z) to prevent accesses after the synchronize_srcu() from being + * executed before the grace period ends. */ - smp_mb(); /* A */ + if (!did_gp) + smp_mb(); /* A */ + else + synchronize_rcu(); /* X */ /* * If the locks are the same as the unlocks, then there must have @@ -536,7 +561,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * which are unlikely to be configured with an address space fully * populated with memory, at least not anytime soon. */ - return srcu_readers_lock_idx(ssp, idx) == unlocks; + return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks); } /** @@ -554,12 +579,12 @@ static bool srcu_readers_active(struct srcu_struct *ssp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[0]); - sum += atomic_long_read(&cpuc->srcu_lock_count[1]); - sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]); - sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_lock_count[0]); + sum += atomic_long_read(&sdp->srcu_lock_count[1]); + sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); + sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); } return sum; } @@ -622,7 +647,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; - if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { j = jiffies - 1; @@ -687,28 +712,28 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -#ifdef CONFIG_PROVE_RCU /* - * Check for consistent NMI safety. + * Check for consistent reader flavor. */ -void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe) +void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { - int nmi_safe_mask = 1 << nmi_safe; - int old_nmi_safe_mask; + int old_read_flavor; struct srcu_data *sdp; - /* NMI-unsafe use in NMI is a bad sign */ - WARN_ON_ONCE(!nmi_safe && in_nmi()); + /* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. */ + WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi()); + WARN_ON_ONCE(read_flavor & (read_flavor - 1)); + sdp = raw_cpu_ptr(ssp->sda); - old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety); - if (!old_nmi_safe_mask) { - WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask); - return; + old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor); + if (!old_read_flavor) { + old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor); + if (!old_read_flavor) + return; } - WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask); + WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor); } -EXPORT_SYMBOL_GPL(srcu_check_nmi_safety); -#endif /* CONFIG_PROVE_RCU */ +EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); /* * Counts the new reader in the appropriate per-CPU element of the @@ -867,7 +892,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(sup); idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) cbdelay = 0; WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); @@ -1122,6 +1147,8 @@ static void srcu_flip(struct srcu_struct *ssp) * it stays until either (1) Compilers learn about this sort of * control dependency or (2) Some production workload running on * a production system is unduly delayed by this slowpath smp_mb(). + * Except for _lite() readers, where it is inoperative, which + * means that it is a good thing that it is redundant. */ smp_mb(); /* E */ /* Pairs with B and C. */ @@ -1139,7 +1166,9 @@ static void srcu_flip(struct srcu_struct *ssp) } /* - * If SRCU is likely idle, return true, otherwise return false. + * If SRCU is likely idle, in other words, the next SRCU grace period + * should be expedited, return true, otherwise return false. Except that + * in the presence of _lite() readers, always return false. * * Note that it is OK for several current from-idle requests for a new * grace period from idle to specify expediting because they will all end @@ -1159,7 +1188,7 @@ static void srcu_flip(struct srcu_struct *ssp) * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ -static bool srcu_might_be_idle(struct srcu_struct *ssp) +static bool srcu_should_expedite(struct srcu_struct *ssp) { unsigned long curseq; unsigned long flags; @@ -1168,6 +1197,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long tlast; check_init_srcu_struct(ssp); + /* If _lite() readers, don't do unsolicited expediting. */ + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_rcu_node(sdp, flags); @@ -1469,14 +1501,15 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * Implementation of these memory-ordering guarantees is similar to * that of synchronize_rcu(). * - * If SRCU is likely idle, expedite the first request. This semantic - * was provided by Classic SRCU, and is relied upon by its users, so TREE - * SRCU must also provide it. Note that detecting idleness is heuristic - * and subject to both false positives and negatives. + * If SRCU is likely idle as determined by srcu_should_expedite(), + * expedite the first request. This semantic was provided by Classic SRCU, + * and is relied upon by its users, so TREE SRCU must also provide it. + * Note that detecting idleness is heuristic and subject to both false + * positives and negatives. */ void synchronize_srcu(struct srcu_struct *ssp) { - if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) + if (srcu_should_expedite(ssp) || rcu_gp_is_expedited()) synchronize_srcu_expedited(ssp); else __synchronize_srcu(ssp, true); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4d7ee95df06e..59314da5eb60 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1407,7 +1407,8 @@ static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) */ void synchronize_rcu_tasks_rude(void) { - synchronize_rcu_tasks_generic(&rcu_tasks_rude); + if (!IS_ENABLED(CONFIG_ARCH_WANTS_NO_INSTR) || IS_ENABLED(CONFIG_FORCE_TASKS_RUDE_RCU)) + synchronize_rcu_tasks_generic(&rcu_tasks_rude); } EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); @@ -1549,22 +1550,7 @@ static void rcu_st_need_qs(struct task_struct *t, u8 v) */ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) { - union rcu_special ret; - union rcu_special trs_old = READ_ONCE(t->trc_reader_special); - union rcu_special trs_new = trs_old; - - if (trs_old.b.need_qs != old) - return trs_old.b.need_qs; - trs_new.b.need_qs = new; - - // Although cmpxchg() appears to KCSAN to update all four bytes, - // only the .b.need_qs byte actually changes. - instrument_atomic_read_write(&t->trc_reader_special.b.need_qs, - sizeof(t->trc_reader_special.b.need_qs)); - // Avoid false-positive KCSAN failures. - ret.s = data_race(cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s)); - - return ret.b.need_qs; + return cmpxchg(&t->trc_reader_special.b.need_qs, old, new); } EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b1f883fcd918..ff98233d4aa5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3511,7 +3511,7 @@ static int krc_count(struct kfree_rcu_cpu *krcp) } static void -schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) { long delay, delay_left; @@ -3525,6 +3525,16 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); } +static void +schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&krcp->lock, flags); + __schedule_delayed_monitor_work(krcp); + raw_spin_unlock_irqrestore(&krcp->lock, flags); +} + static void kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) { @@ -3836,7 +3846,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - schedule_delayed_monitor_work(krcp); + __schedule_delayed_monitor_work(krcp); unlock_return: krc_this_cpu_unlock(krcp, flags); @@ -4194,7 +4204,6 @@ static void start_poll_synchronize_rcu_common(void) struct rcu_data *rdp; struct rcu_node *rnp; - lockdep_assert_irqs_enabled(); local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; @@ -4219,9 +4228,6 @@ static void start_poll_synchronize_rcu_common(void) * grace period has elapsed in the meantime. If the needed grace period * is not already slated to start, notifies RCU core of the need for that * grace period. - * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. */ unsigned long start_poll_synchronize_rcu(void) { @@ -4242,9 +4248,6 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * grace period (whether normal or expedited) has elapsed in the meantime. * If the needed grace period is not already slated to start, notifies * RCU core of the need for that grace period. - * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. */ void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { @@ -5580,8 +5583,7 @@ void rcu_init_geometry(void) * Complain and fall back to the compile-time values if this * limit is exceeded. */ - if (rcu_fanout_leaf < 2 || - rcu_fanout_leaf > sizeof(unsigned long) * 8) { + if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > BITS_PER_LONG) { rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); return; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 16865475120b..2605dd234a13 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -891,7 +891,18 @@ static void nocb_cb_wait(struct rcu_data *rdp) swait_event_interruptible_exclusive(rdp->nocb_cb_wq, nocb_cb_wait_cond(rdp)); if (kthread_should_park()) { - kthread_parkme(); + /* + * kthread_park() must be preceded by an rcu_barrier(). + * But yet another rcu_barrier() might have sneaked in between + * the barrier callback execution and the callbacks counter + * decrement. + */ + if (rdp->nocb_cb_sleep) { + rcu_nocb_lock_irqsave(rdp, flags); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_nocb_unlock_irqrestore(rdp, flags); + kthread_parkme(); + } } else if (READ_ONCE(rdp->nocb_cb_sleep)) { WARN_ON(signal_pending(current)); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1c7cbd145d5e..3927ea5f7955 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -183,9 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) switch (blkd_state) { case 0: case RCU_EXP_TASKS: - case RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_EXP_TASKS | RCU_GP_BLKD: case RCU_GP_TASKS: - case RCU_GP_TASKS + RCU_EXP_TASKS: + case RCU_GP_TASKS | RCU_EXP_TASKS: /* * Blocking neither GP, or first task blocking the normal @@ -198,10 +198,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) case RCU_EXP_BLKD: case RCU_GP_BLKD: - case RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: /* * First task arriving that blocks either GP, or first task @@ -214,9 +214,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); break; - case RCU_EXP_TASKS + RCU_EXP_BLKD: - case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: + case RCU_EXP_TASKS | RCU_EXP_BLKD: + case RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_EXP_BLKD: /* * Second or subsequent task blocking the expedited GP. @@ -227,8 +227,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) list_add(&t->rcu_node_entry, rnp->exp_tasks); break; - case RCU_GP_TASKS + RCU_GP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS | RCU_GP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD: /* * Second or subsequent task blocking the normal GP. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4432db6d0b99..925fcdad5dea 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -76,36 +76,6 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); -/** - * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? - * - * Returns @true if the current grace period is sufficiently old that - * it is reasonable to assume that it might be stalled. This can be - * useful when deciding whether to allocate memory to enable RCU-mediated - * freeing on the one hand or just invoking synchronize_rcu() on the other. - * The latter is preferable when the grace period is stalled. - * - * Note that sampling of the .gp_start and .gp_seq fields must be done - * carefully to avoid false positives at the beginnings and ends of - * grace periods. - */ -bool rcu_gp_might_be_stalled(void) -{ - unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; - unsigned long j = jiffies; - - if (d < RCU_STALL_MIGHT_MIN) - d = RCU_STALL_MIGHT_MIN; - smp_mb(); // jiffies before .gp_seq to avoid false positives. - if (!rcu_gp_in_progress()) - return false; - // Long delays at this point avoids false positive, but a delay - // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. - smp_mb(); // .gp_seq before second .gp_start - // And ditto here. - return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); -} - /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -365,7 +335,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) * that don't support NMI-based stack dumps. The NMI-triggered stack * traces are more accurate because they are printed by the target CPU. */ -static void rcu_dump_cpu_stacks(void) +static void rcu_dump_cpu_stacks(unsigned long gp_seq) { int cpu; unsigned long flags; @@ -373,15 +343,23 @@ static void rcu_dump_cpu_stacks(void) rcu_for_each_leaf_node(rnp) { printk_deferred_enter(); - raw_spin_lock_irqsave_rcu_node(rnp, flags); - for_each_leaf_node_possible_cpu(rnp, cpu) + for_each_leaf_node_possible_cpu(rnp, cpu) { + if (gp_seq != data_race(rcu_state.gp_seq)) { + printk_deferred_exit(); + pr_err("INFO: Stall ended during stack backtracing.\n"); + return; + } + if (!(data_race(rnp->qsmask) & leaf_node_cpu_bit(rnp, cpu))) + continue; + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { if (cpu_is_offline(cpu)) pr_err("Offline CPU %d blocking current GP.\n", cpu); else dump_cpu_task(cpu); } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } printk_deferred_exit(); } } @@ -638,7 +616,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, data_race(rcu_state.n_online_cpus)); // Diagnostic read if (ndetected) { - rcu_dump_cpu_stacks(); + rcu_dump_cpu_stacks(gp_seq); /* Complain about tasks blocking the grace period. */ rcu_for_each_leaf_node(rnp) @@ -670,7 +648,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) rcu_force_quiescent_state(); /* Kick them all. */ } -static void print_cpu_stall(unsigned long gps) +static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -707,7 +685,7 @@ static void print_cpu_stall(unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); - rcu_dump_cpu_stacks(); + rcu_dump_cpu_stacks(gp_seq); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Rewrite if needed in case of slow consoles. */ @@ -789,7 +767,8 @@ static void check_cpu_stall(struct rcu_data *rdp) gs2 = READ_ONCE(rcu_state.gp_seq); if (gs1 != gs2 || ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) + ULONG_CMP_GE(gps, js) || + !rcu_seq_state(gs2)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + ULONG_MAX / 2; @@ -810,7 +789,7 @@ static void check_cpu_stall(struct rcu_data *rdp) pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); } else if (self_detected) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(gps); + print_cpu_stall(gs2, gps); } else { /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2, gps); diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh index c3808c490d92..f87046b702d8 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh @@ -56,27 +56,30 @@ do echo > $i/kvm-test-1-run-qemu.sh.out export TORTURE_AFFINITY= kvm-get-cpus-script.sh $T/cpuarray.awk $T/cpubatches.awk $T/cpustate - cat << ' ___EOF___' >> $T/cpubatches.awk - END { - affinitylist = ""; - if (!gotcpus()) { - print "echo No CPU-affinity information, so no taskset command."; - } else if (cpu_count !~ /^[0-9][0-9]*$/) { - print "echo " scenario ": Bogus number of CPUs (old qemu-cmd?), so no taskset command."; - } else { - affinitylist = nextcpus(cpu_count); - if (!(affinitylist ~ /^[0-9,-][0-9,-]*$/)) - print "echo " scenario ": Bogus CPU-affinity information, so no taskset command."; - else if (!dumpcpustate()) - print "echo " scenario ": Could not dump state, so no taskset command."; - else - print "export TORTURE_AFFINITY=" affinitylist; + if test -z "${TORTURE_NO_AFFINITY}" + then + cat << ' ___EOF___' >> $T/cpubatches.awk + END { + affinitylist = ""; + if (!gotcpus()) { + print "echo No CPU-affinity information, so no taskset command."; + } else if (cpu_count !~ /^[0-9][0-9]*$/) { + print "echo " scenario ": Bogus number of CPUs (old qemu-cmd?), so no taskset command."; + } else { + affinitylist = nextcpus(cpu_count); + if (!(affinitylist ~ /^[0-9,-][0-9,-]*$/)) + print "echo " scenario ": Bogus CPU-affinity information, so no taskset command."; + else if (!dumpcpustate()) + print "echo " scenario ": Could not dump state, so no taskset command."; + else + print "export TORTURE_AFFINITY=" affinitylist; + } } - } - ___EOF___ - cpu_count="`grep '# TORTURE_CPU_COUNT=' $i/qemu-cmd | sed -e 's/^.*=//'`" - affinity_export="`awk -f $T/cpubatches.awk -v cpu_count="$cpu_count" -v scenario=$i < /dev/null`" - $affinity_export + ___EOF___ + cpu_count="`grep '# TORTURE_CPU_COUNT=' $i/qemu-cmd | sed -e 's/^.*=//'`" + affinity_export="`awk -f $T/cpubatches.awk -v cpu_count="$cpu_count" -v scenario=$i < /dev/null`" + $affinity_export + fi kvm-test-1-run-qemu.sh $i >> $i/kvm-test-1-run-qemu.sh.out 2>&1 & done for i in $runfiles diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 7af73ddc148d..42e5e8597a1a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -42,6 +42,7 @@ TORTURE_JITTER_STOP="" TORTURE_KCONFIG_KASAN_ARG="" TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" +TORTURE_NO_AFFINITY="" TORTURE_QEMU_MEM=512 torture_qemu_mem_default=1 TORTURE_REMOTE= @@ -82,6 +83,7 @@ usage () { echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --memory megabytes|nnnG" + echo " --no-affinity" echo " --no-initrd" echo " --qemu-args qemu-arguments" echo " --qemu-cmd qemu-system-..." @@ -220,6 +222,9 @@ do torture_qemu_mem_default= shift ;; + --no-affinity) + TORTURE_NO_AFFINITY="no-affinity" + ;; --no-initrd) TORTURE_INITRD=""; export TORTURE_INITRD ;; @@ -417,6 +422,7 @@ TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_K TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG TORTURE_MOD="$TORTURE_MOD"; export TORTURE_MOD +TORTURE_NO_AFFINITY="$TORTURE_NO_AFFINITY"; export TORTURE_NO_AFFINITY TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST index 98b6175e5aa0..45f572570a8c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST @@ -5,6 +5,7 @@ TREE04 TREE05 TREE07 TREE09 +SRCU-L SRCU-N SRCU-P SRCU-T diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L new file mode 100644 index 000000000000..3b4fa8dbef8a --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L @@ -0,0 +1,10 @@ +CONFIG_RCU_TRACE=n +CONFIG_SMP=y +CONFIG_NR_CPUS=6 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_RCU_EXPERT=n +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot new file mode 100644 index 000000000000..0207b3138c5b --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot @@ -0,0 +1,3 @@ +rcutorture.torture_type=srcu +rcutorture.reader_flavor=0x4 +rcutorture.fwd_progress=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot index ce0694fd9b92..b54cf87dc110 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot @@ -1,2 +1,3 @@ rcutorture.torture_type=srcu +rcutorture.reader_flavor=0x2 rcutorture.fwd_progress=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 index a323d8948b7c..759ee51d3ddc 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -1,5 +1,5 @@ CONFIG_SMP=y -CONFIG_NR_CPUS=56 +CONFIG_NR_CPUS=74 CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n