From 808a9d675969382c855acc304c288dfa713d3f9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Dec 2022 11:21:09 -0800 Subject: [PATCH 01/54] rcutorture: Add test_nmis module parameter This commit adds a test_nmis module parameter to generate the specified number of NMI stack backtraces 15 seconds apart. This module parameter can be used to test NMI delivery and accompanying diagnostics. Note that this parameter is ignored when rcutorture is a module rather than built into the kernel. This could be changed with the addition of an EXPORT_SYMBOL_GPL(). [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8e6c023212cb..480bba142e3a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -119,6 +119,7 @@ torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); +torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); @@ -2358,7 +2359,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " - "nocbs_nthreads=%d nocbs_toggle=%d\n", + "nocbs_nthreads=%d nocbs_toggle=%d " + "test_nmis=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -2369,7 +2371,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, - nocbs_nthreads, nocbs_toggle); + nocbs_nthreads, nocbs_toggle, + test_nmis); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -3273,6 +3276,29 @@ static void rcu_torture_read_exit_cleanup(void) torture_stop_kthread(rcutorture_read_exit, read_exit_task); } +static void rcutorture_test_nmis(int n) +{ +#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) + int cpu; + int dumpcpu; + int i; + + for (i = 0; i < n; i++) { + preempt_disable(); + cpu = smp_processor_id(); + dumpcpu = cpu + 1; + if (dumpcpu >= nr_cpu_ids) + dumpcpu = 0; + pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu); + dump_cpu_task(dumpcpu); + preempt_enable(); + schedule_timeout_uninterruptible(15 * HZ); + } +#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) + WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis); +#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) +} + static enum cpuhp_state rcutor_hp; static void @@ -3297,6 +3323,8 @@ rcu_torture_cleanup(void) return; } + rcutorture_test_nmis(test_nmis); + if (cur_ops->gp_kthread_dbg) cur_ops->gp_kthread_dbg(); rcu_torture_read_exit_cleanup(); From 7ff0b54491791bdb99659e73c9851ac398c81416 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 9 Jan 2023 21:01:12 -0800 Subject: [PATCH 02/54] rcutorture: Set CONFIG_BOOTPARAM_HOTPLUG_CPU0 to offline CPU 0 There is now a BOOTPARAM_HOTPLUG_CPU0 Kconfig option that allows CPU 0 to be offlined on x86 systems. This commit therefore sets this option in the TREE01 rcutorture scenario in order to regularly test this capability. Reported-by: "Zhang, Qiang1" Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/configs/rcu/TREE01 | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 index 8ae41d5f81a3..04831ef1f9b5 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 @@ -15,3 +15,4 @@ CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y +CONFIG_BOOTPARAM_HOTPLUG_CPU0=y From 7c3a8b48dc5452ec639293fd7796402d02213f58 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Jan 2023 21:26:11 -0800 Subject: [PATCH 03/54] rcutorture: Make scenario TREE04 enable lazy call_rcu() This commit enables the RCU_LAZY Kconfig option in scenario TREE04 in order to provide some ongoing testing of this configuration. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/configs/rcu/TREE04 | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index ae395981b5e5..dc4985064b3a 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -15,3 +15,4 @@ CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y CONFIG_RCU_EQS_DEBUG=y +CONFIG_RCU_LAZY=y From 236bdb881d9656194a2d9c78ee11dba39329a3b1 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Tue, 10 Jan 2023 16:46:34 +0530 Subject: [PATCH 04/54] tools: rcu: Add usage function and check for argument This commit converts extract-stall.sh script's header comment to a usage() function, and adds an argument check. While in the area, make this script be executable. [ paulmck: Strength argument check, remove extraneous comment. ] Signed-off-by: Bhaskar Chowdhury Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/rcu/extract-stall.sh | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) mode change 100644 => 100755 tools/rcu/extract-stall.sh diff --git a/tools/rcu/extract-stall.sh b/tools/rcu/extract-stall.sh old mode 100644 new mode 100755 index e565697c9f90..08a39ad44320 --- a/tools/rcu/extract-stall.sh +++ b/tools/rcu/extract-stall.sh @@ -1,11 +1,25 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ -# -# Extract any RCU CPU stall warnings present in specified file. -# Filter out clocksource lines. Note that preceding-lines excludes the -# initial line of the stall warning but trailing-lines includes it. -# -# Usage: extract-stall.sh dmesg-file [ preceding-lines [ trailing-lines ] ] + +usage() { + echo Extract any RCU CPU stall warnings present in specified file. + echo Filter out clocksource lines. Note that preceding-lines excludes the + echo initial line of the stall warning but trailing-lines includes it. 
+ echo + echo Usage: $(basename $0) dmesg-file [ preceding-lines [ trailing-lines ] ] + echo + echo Error: $1 +} + +# Terminate the script, if the argument is missing + +if test -f "$1" && test -r "$1" +then + : +else + usage "Console log file \"$1\" missing or unreadable." + exit 1 +fi echo $1 preceding_lines="${2-3}" From 995495846f7403cd871b22b03b40bd7e24be07a3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Jan 2023 21:23:46 -0800 Subject: [PATCH 05/54] torture: Permit kvm-again.sh --duration to default to previous run Currently, invoking kvm-again.sh without a --duration argument results in a bash error message. This commit therefore adds quotes around the $dur argument to kvm-transform.sh to allow a default duration to be taken from the earlier run. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/kvm-again.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 8a968fbda02c..88ca4e368489 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -193,7 +193,7 @@ do qemu_cmd_dir="`dirname "$i"`" kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" jitter_dir="`dirname "$kernel_dir"`" - kvm-transform.sh "$kernel_dir/bzImage" "$qemu_cmd_dir/console.log" "$jitter_dir" $dur "$bootargs" < $T/qemu-cmd > $i + kvm-transform.sh "$kernel_dir/bzImage" "$qemu_cmd_dir/console.log" "$jitter_dir" "$dur" "$bootargs" < $T/qemu-cmd > $i if test -n "$arg_remote" then echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i From fdcf87a3dfbe19361ff128806a218a28da7689ad Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Tue, 17 Jan 2023 13:53:10 +0800 Subject: [PATCH 06/54] rcutorture: Eliminate variable n_rcu_torture_boost_rterror After commit 8b700983de82 ("sched: Remove sched_set_*() return value"), this variable is not used anymore. So eliminate it entirely. Signed-off-by: Yue Hu Signed-off-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 480bba142e3a..c0b2fd687bbb 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -180,7 +180,6 @@ static atomic_t n_rcu_torture_mbchk_tries; static atomic_t n_rcu_torture_error; static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; -static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; @@ -2195,12 +2194,11 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ", atomic_read(&n_rcu_torture_mberror), atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries), n_rcu_torture_barrier_error, - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); + n_rcu_torture_boost_ktrerror); pr_cont("rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, @@ -2218,15 +2216,13 @@ rcu_torture_stats_print(void) if (atomic_read(&n_rcu_torture_mberror) || atomic_read(&n_rcu_torture_mbchk_fail) || n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror || - n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure || - i > 1) { + n_rcu_torture_boost_failure || i > 1) { pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror)); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail)); WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread - WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?) WARN_ON_ONCE(i > 1); // Too-short grace period } @@ -3568,7 +3564,6 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; - n_rcu_torture_boost_rterror = 0; n_rcu_torture_boost_failure = 0; n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) From 877a0e83c57fa5e2a7fd628ec2e1733ed70c8792 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Feb 2023 17:04:09 -0800 Subject: [PATCH 07/54] torture: Enable clocksource watchdog with "tsc=watchdog" This commit tests the "tsc=watchdog" kernel boot parameter when running the clocksourcewd torture tests. Signed-off-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/torture.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 130d0de4c3bb..5a2ae2264403 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -497,16 +497,16 @@ fi if test "$do_clocksourcewd" = "yes" then - torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000" + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 tsc=watchdog" torture_set "clocksourcewd-1" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make - torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 clocksource.max_cswd_read_retries=1" + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 clocksource.max_cswd_read_retries=1 tsc=watchdog" torture_set "clocksourcewd-2" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make # In case our work is already done... if test "$do_rcutorture" != "yes" then - torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000" + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 tsc=watchdog" torture_set "clocksourcewd-3" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --trust-make fi fi From 4f02ac237813a31d4fcb4c9eb9b699b15956b53b Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 3 Feb 2023 14:40:54 +0800 Subject: [PATCH 08/54] rcutorture: Create nocb kthreads only when testing rcu in CONFIG_RCU_NOCB_CPU=y kernels Given a non-zero rcutorture.nocbs_nthreads module parameter, the specified number of nocb kthreads will be created, regardless of whether or not the RCU implementation under test is capable of offloading callbacks. Please note that even vanilla RCU is incapable of offloading in kernels built with CONFIG_RCU_NOCB_CPU=n. And when the RCU implementation is incapable of offloading callbacks, there is no point in creating those kthreads. This commit therefore checks the cur_ops.torture_type module parameter and CONFIG_RCU_NOCB_CPU Kconfig option in order to avoid creating unnecessary nocb tasks. Signed-off-by: Zqiang Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney [ boqun: Fix checkpatch warning ] Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c0b2fd687bbb..e046d2c6fe10 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -3525,6 +3525,12 @@ rcu_torture_init(void) pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } + if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops || + !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) { + pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n", + cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU)); + nocbs_nthreads = 0; + } if (cur_ops->init) cur_ops->init(); From ef1ef3d47677dc191b88650a9f7f91413452cc1b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Jan 2023 12:08:54 -0800 Subject: [PATCH 09/54] rcuscale: Move shutdown from wait_event() to wait_event_idle() The rcu_scale_shutdown() and kfree_scale_shutdown() kthreads/functions use wait_event() to wait for the rcuscale test to complete. However, each updater thread in such a test waits for at least 100 grace periods. If each grace period takes more than 1.2 seconds, which is long, but not insanely so, this can trigger the hung-task timeout. This commit therefore replaces those wait_event() calls with calls to wait_event_idle(), which do not trigger the hung-task timeout. Reported-by: kernel test robot Reported-by: Liam Howlett Signed-off-by: Paul E. McKenney Tested-by: Yujie Liu Signed-off-by: Boqun Feng --- kernel/rcu/rcuscale.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 91fb5905a008..4120f94030c3 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -631,8 +631,7 @@ static int compute_real(int n) static int rcu_scale_shutdown(void *arg) { - wait_event(shutdown_wq, - atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ rcu_scale_cleanup(); kernel_power_off(); @@ -771,8 +770,8 @@ kfree_scale_cleanup(void) static int kfree_scale_shutdown(void *arg) { - wait_event(shutdown_wq, - atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); + wait_event_idle(shutdown_wq, + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ From 6bc6e6b27524304aadb9c04611ddb1c84dd7617a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Jan 2023 16:12:18 -0800 Subject: [PATCH 10/54] refscale: Move shutdown from wait_event() to wait_event_idle() The ref_scale_shutdown() kthread/function uses wait_event() to wait for the refscale test to complete. However, although the read-side tests are normally extremely fast, there is no law against specifying a very large value for the refscale.loops module parameter or against having a slow read-side primitive. Either way, this might well trigger the hung-task timeout. This commit therefore replaces those wait_event() calls with calls to wait_event_idle(), which do not trigger the hung-task timeout. Signed-off-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- kernel/rcu/refscale.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index afa3e1a2f690..1970ce5f22d4 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -1031,7 +1031,7 @@ ref_scale_cleanup(void) static int ref_scale_shutdown(void *arg) { - wait_event(shutdown_wq, shutdown_start); + wait_event_idle(shutdown_wq, shutdown_start); smp_mb(); // Wake before output. ref_scale_cleanup(); From 2f1f043e7bea3fbf4c1869df2f7a0312bc8ca2bf Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 12 Jan 2023 22:59:53 -0800 Subject: [PATCH 11/54] locking/lockdep: Introduce lock_sync() Currently, functions like synchronize_srcu() do not have lockdep annotations resembling those of other write-side locking primitives. Such annotations might look as follows: lock_acquire(); lock_release(); Such annotations would tell lockdep that synchronize_srcu() acts like an empty critical section that waits for other (read-side) critical sections to finish. This would definitely catch some deadlock, but as pointed out by Paul Mckenney [1], this could also introduce false positives because of irq-safe/unsafe detection. Of course, there are tricks could help with this: might_sleep(); // Existing statement in __synchronize_srcu(). if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { local_irq_disable(); lock_acquire(); lock_release(); local_irq_enable(); } But it would be better for lockdep to provide a separate annonation for functions like synchronize_srcu(), so that people won't need to repeat the ugly tricks above. Therefore introduce lock_sync(), which is simply an lock+unlock pair with no irq safe/unsafe deadlock check. This works because the to-be-annontated functions do not create real critical sections, and there is therefore no way that irq can create extra dependencies. [1]: https://lore.kernel.org/lkml/20180412021233.ewncg5jjuzjw3x62@tardis/ Signed-off-by: Boqun Feng Acked-by: Waiman Long Signed-off-by: Paul E. McKenney [ boqun: Fix typos reported by Davidlohr Bueso and Paul E. 
Mckenney ] Acked-by: Peter Zijlstra (Intel) Signed-off-by: Boqun Feng --- include/linux/lockdep.h | 5 +++++ kernel/locking/lockdep.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 1023f349af71..14d9dbedc6c1 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -268,6 +268,10 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, unsigned long ip); +extern void lock_sync(struct lockdep_map *lock, unsigned int subclass, + int read, int check, struct lockdep_map *nest_lock, + unsigned long ip); + /* lock_is_held_type() returns */ #define LOCK_STATE_UNKNOWN -1 #define LOCK_STATE_NOT_HELD 0 @@ -554,6 +558,7 @@ do { \ #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) +#define lock_map_sync(l) lock_sync(l, 0, 0, 1, NULL, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 50d4863974e7..3ee3b278789d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5693,6 +5693,40 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_release); +/* + * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API. + * + * No actual critical section is created by the APIs annotated with this: these + * APIs are used to wait for one or multiple critical sections (on other CPUs + * or threads), and it means that calling these APIs inside these critical + * sections is potential deadlock. + * + * This annotation acts as an acquire+release annotation pair with hardirqoff + * being 1. Since there's no critical section, no interrupt can create extra + * dependencies "inside" the annotation, hardirqoff == 1 allows us to avoid + * false positives. + */ +void lock_sync(struct lockdep_map *lock, unsigned subclass, int read, + int check, struct lockdep_map *nest_lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lockdep_enabled())) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + lockdep_recursion_inc(); + __lock_acquire(lock, subclass, 0, read, check, 1, nest_lock, ip, 0, 0); + + if (__lock_release(lock, ip)) + check_chain_key(current); + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_sync); + noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; From f0f44752f5f61ee4e3bd88ae033fdb888320aafe Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 12 Jan 2023 22:59:54 -0800 Subject: [PATCH 12/54] rcu: Annotate SRCU's update-side lockdep dependencies Although all flavors of RCU readers are annotated correctly with lockdep as recursive read locks, they do not set the lock_acquire 'check' parameter. This means that RCU read locks are not added to the lockdep dependency graph, which in turn means that lockdep cannot detect RCU-based deadlocks. This is not a problem for RCU flavors having atomic read-side critical sections because context-based annotations can catch these deadlocks, see for example the RCU_LOCKDEP_WARN() statement in synchronize_rcu(). 
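For reference, that context-based check resembles the following sketch (modeled on the statement in kernel/rcu/tree.c; the exact wording varies across kernel versions):

	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
			 lock_is_held(&rcu_lock_map) ||
			 lock_is_held(&rcu_sched_lock_map),
			 "Illegal synchronize_rcu() in RCU read-side critical section");

Because vanilla RCU readers cannot block, checking the current context for one of these held read-lock maps at synchronize_rcu() time suffices to flag the deadlock.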
But context-based annotations are not helpful for sleepable RCU, especially given that it is perfectly legal to do synchronize_srcu(&srcu1) within an srcu_read_lock(&srcu2). However, we can detect SRCU-based by: (1) Making srcu_read_lock() a 'check'ed recursive read lock and (2) Making synchronize_srcu() a empty write lock critical section. Even better, with the newly introduced lock_sync(), we can avoid false positives about irq-unsafe/safe. This commit therefore makes it so. Note that NMI-safe SRCU read side critical sections are currently not annotated, but might be annotated in the future. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney [ boqun: Add comments for annotation per Waiman's suggestion ] [ boqun: Fix comment warning reported by Stephen Rothwell ] Acked-by: Peter Zijlstra (Intel) Signed-off-by: Boqun Feng --- include/linux/srcu.h | 34 ++++++++++++++++++++++++++++++++-- kernel/rcu/srcutiny.c | 2 ++ kernel/rcu/srcutree.c | 2 ++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 74796cd7e7a9..41c4b26fb1c1 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -102,6 +102,32 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) return lock_is_held(&ssp->dep_map); } +/* + * Annotations provide deadlock detection for SRCU. + * + * Similar to other lockdep annotations, except there is an additional + * srcu_lock_sync(), which is basically an empty *write*-side critical section, + * see lock_sync() for more information. + */ + +/* Annotates a srcu_read_lock() */ +static inline void srcu_lock_acquire(struct lockdep_map *map) +{ + lock_map_acquire_read(map); +} + +/* Annotates a srcu_read_lock() */ +static inline void srcu_lock_release(struct lockdep_map *map) +{ + lock_map_release(map); +} + +/* Annotates a synchronize_srcu() */ +static inline void srcu_lock_sync(struct lockdep_map *map) +{ + lock_map_sync(map); +} + #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) @@ -109,6 +135,10 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) return 1; } +#define srcu_lock_acquire(m) do { } while (0) +#define srcu_lock_release(m) do { } while (0) +#define srcu_lock_sync(m) do { } while (0) + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #define SRCU_NMI_UNKNOWN 0x0 @@ -182,7 +212,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) srcu_check_nmi_safety(ssp, false); retval = __srcu_read_lock(ssp); - rcu_lock_acquire(&(ssp)->dep_map); + srcu_lock_acquire(&(ssp)->dep_map); return retval; } @@ -254,7 +284,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); srcu_check_nmi_safety(ssp, false); - rcu_lock_release(&(ssp)->dep_map); + srcu_lock_release(&(ssp)->dep_map); __srcu_read_unlock(ssp, idx); } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index b12fb0cec44d..336af24e0fe3 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -197,6 +197,8 @@ void synchronize_srcu(struct srcu_struct *ssp) { struct rcu_synchronize rs; + srcu_lock_sync(&ssp->dep_map); + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ab4ee58af84b..c541b82646b6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1307,6 +1307,8 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) { struct 
rcu_synchronize rcu; + srcu_lock_sync(&ssp->dep_map); + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || From 60a1a64ec0c08e02f2cd4372cd3971e9d5eb2b37 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Sun, 12 Mar 2023 12:04:48 -0700 Subject: [PATCH 13/54] locking: Reduce the number of locks in ww_mutex stress tests The stress test in test_ww_mutex_init() uses 4095 locks since lockdep::reference has 12 bits, and since we are going to reduce it to 11 bits to support lock_sync(), and 2047 is still a reasonable number of the max nesting level for locks, so adjust the test. Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202302011445.9d99dae2-oliver.sang@intel.com Tested-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) Signed-off-by: Boqun Feng --- kernel/locking/test-ww_mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 29dc253d03af..93cca6e69860 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -659,7 +659,7 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); + ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); if (ret) return ret; From 0471db447cb7de56bbe2fedd9256b4d2b8ef642a Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 13 Jan 2023 15:57:22 -0800 Subject: [PATCH 14/54] locking/lockdep: Improve the deadlock scenario print for sync and read lock Lock scenario print is always a weak spot of lockdep splats. Improvement can be made if we rework the dependency search and the error printing. However without touching the graph search, we can improve a little for the circular deadlock case, since we have the to-be-added lock dependency, and know whether these two locks are read/write/sync. In order to know whether a held_lock is sync or not, a bit was "stolen" from ->references, which reduce our limit for the same lock class nesting from 2^12 to 2^11, and it should still be good enough. Besides, since we now have bit in held_lock for sync, we don't need the "hardirqoffs being 1" trick, and also we can avoid the __lock_release() if we jump out of __lock_acquire() before the held_lock stored. With these changes, a deadlock case evolved with read lock and sync gets a better print-out from: [...] Possible unsafe locking scenario: [...] [...] CPU0 CPU1 [...] ---- ---- [...] lock(srcuA); [...] lock(srcuB); [...] lock(srcuA); [...] lock(srcuB); to [...] Possible unsafe locking scenario: [...] [...] CPU0 CPU1 [...] ---- ---- [...] rlock(srcuA); [...] lock(srcuB); [...] lock(srcuA); [...] sync(srcuB); Signed-off-by: Boqun Feng Signed-off-by: Paul E. 
McKenney Acked-by: Peter Zijlstra (Intel) Signed-off-by: Boqun Feng --- include/linux/lockdep.h | 3 ++- kernel/locking/lockdep.c | 48 ++++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 14d9dbedc6c1..b32256e9e944 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -134,7 +134,8 @@ struct held_lock { unsigned int read:2; /* see lock_acquire() comment */ unsigned int check:1; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; - unsigned int references:12; /* 32 bits */ + unsigned int sync:1; + unsigned int references:11; /* 32 bits */ unsigned int pin_count; }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3ee3b278789d..dcd1d5bfc1e0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1881,6 +1881,8 @@ print_circular_lock_scenario(struct held_lock *src, struct lock_class *source = hlock_class(src); struct lock_class *target = hlock_class(tgt); struct lock_class *parent = prt->class; + int src_read = src->read; + int tgt_read = tgt->read; /* * A direct locking problem where unsafe_class lock is taken @@ -1908,7 +1910,10 @@ print_circular_lock_scenario(struct held_lock *src, printk(" Possible unsafe locking scenario:\n\n"); printk(" CPU0 CPU1\n"); printk(" ---- ----\n"); - printk(" lock("); + if (tgt_read != 0) + printk(" rlock("); + else + printk(" lock("); __print_lock_name(target); printk(KERN_CONT ");\n"); printk(" lock("); @@ -1917,7 +1922,12 @@ print_circular_lock_scenario(struct held_lock *src, printk(" lock("); __print_lock_name(target); printk(KERN_CONT ");\n"); - printk(" lock("); + if (src_read != 0) + printk(" rlock("); + else if (src->sync) + printk(" sync("); + else + printk(" lock("); __print_lock_name(source); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); @@ -4531,7 +4541,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) return 0; } } - if (!hlock->hardirqs_off) { + + /* + * For lock_sync(), don't mark the ENABLED usage, since lock_sync() + * creates no critical section and no extra dependency can be introduced + * by interrupts + */ + if (!hlock->hardirqs_off && !hlock->sync) { if (hlock->read) { if (!mark_lock(curr, hlock, LOCK_ENABLED_HARDIRQ_READ)) @@ -4910,7 +4926,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read); static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, struct lockdep_map *nest_lock, unsigned long ip, - int references, int pin_count) + int references, int pin_count, int sync) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -4961,7 +4977,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes; - if (depth) { /* we're holding locks */ + if (depth && !sync) { + /* we're holding locks and the new held lock is not a sync */ hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (!references) @@ -4995,6 +5012,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->trylock = trylock; hlock->read = read; hlock->check = check; + hlock->sync = !!sync; hlock->hardirqs_off = !!hardirqs_off; hlock->references = references; #ifdef CONFIG_LOCK_STAT @@ -5056,6 +5074,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, hlock, chain_head, chain_key)) return 0; 
+ /* For lock_sync(), we are done here since no actual critical section */ + if (hlock->sync) + return 1; + curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -5197,7 +5219,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) { + hlock->references, hlock->pin_count, 0)) { case 0: return 1; case 1: @@ -5667,7 +5689,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, lockdep_recursion_inc(); __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0, 0); + irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0); lockdep_recursion_finish(); raw_local_irq_restore(flags); } @@ -5700,11 +5722,6 @@ EXPORT_SYMBOL_GPL(lock_release); * APIs are used to wait for one or multiple critical sections (on other CPUs * or threads), and it means that calling these APIs inside these critical * sections is potential deadlock. - * - * This annotation acts as an acquire+release annotation pair with hardirqoff - * being 1. Since there's no critical section, no interrupt can create extra - * dependencies "inside" the annotation, hardirqoff == 1 allows us to avoid - * false positives. */ void lock_sync(struct lockdep_map *lock, unsigned subclass, int read, int check, struct lockdep_map *nest_lock, unsigned long ip) @@ -5718,10 +5735,9 @@ void lock_sync(struct lockdep_map *lock, unsigned subclass, int read, check_flags(flags); lockdep_recursion_inc(); - __lock_acquire(lock, subclass, 0, read, check, 1, nest_lock, ip, 0, 0); - - if (__lock_release(lock, ip)) - check_chain_key(current); + __lock_acquire(lock, subclass, 0, read, check, + irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1); + check_chain_key(current); lockdep_recursion_finish(); raw_local_irq_restore(flags); } From d94f12e8a46d3a8623fd77231e2dcab780368685 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Jan 2023 21:34:27 -0800 Subject: [PATCH 15/54] rcutorture: Add SRCU deadlock scenarios In order to test the new SRCU-lockdep functionality, this commit adds an rcutorture.test_srcu_lockdep module parameter that, when non-zero, selects an SRCU deadlock scenario to execute. This parameter is a five-digit number formatted as DNNL, where "D" is 1 to force a deadlock and 0 to avoid doing so; "NN" is the test number, 0 for SRCU-based, 1 for SRCU/mutex-based, and 2 for SRCU/rwsem-based; and "L" is the number of steps in the deadlock cycle. Note that rcutorture.test_srcu_lockdep=1 will also force a hard hang. If a non-zero value of rcutorture.test_srcu_lockdep does not select a deadlock scenario, a console message is printed and testing continues. [ paulmck: Apply kernel test robot feedback, add rwsem support. ] [ paulmck: Apply Dan Carpenter feedback. ] Signed-off-by: Paul E. 
McKenney Acked-by: Peter Zijlstra (Intel) Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 151 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8e6c023212cb..80ff9a743d31 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -120,6 +120,7 @@ torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); +torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; @@ -3463,6 +3464,154 @@ static void rcutorture_sync(void) cur_ops->sync(); } +static DEFINE_MUTEX(mut0); +static DEFINE_MUTEX(mut1); +static DEFINE_MUTEX(mut2); +static DEFINE_MUTEX(mut3); +static DEFINE_MUTEX(mut4); +static DEFINE_MUTEX(mut5); +static DEFINE_MUTEX(mut6); +static DEFINE_MUTEX(mut7); +static DEFINE_MUTEX(mut8); +static DEFINE_MUTEX(mut9); + +static DECLARE_RWSEM(rwsem0); +static DECLARE_RWSEM(rwsem1); +static DECLARE_RWSEM(rwsem2); +static DECLARE_RWSEM(rwsem3); +static DECLARE_RWSEM(rwsem4); +static DECLARE_RWSEM(rwsem5); +static DECLARE_RWSEM(rwsem6); +static DECLARE_RWSEM(rwsem7); +static DECLARE_RWSEM(rwsem8); +static DECLARE_RWSEM(rwsem9); + +DEFINE_STATIC_SRCU(srcu0); +DEFINE_STATIC_SRCU(srcu1); +DEFINE_STATIC_SRCU(srcu2); +DEFINE_STATIC_SRCU(srcu3); +DEFINE_STATIC_SRCU(srcu4); +DEFINE_STATIC_SRCU(srcu5); +DEFINE_STATIC_SRCU(srcu6); +DEFINE_STATIC_SRCU(srcu7); +DEFINE_STATIC_SRCU(srcu8); +DEFINE_STATIC_SRCU(srcu9); + +static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i, + int cyclelen, int deadlock) +{ + int j = i + 1; + + if (j >= cyclelen) + j = deadlock ? 0 : -1; + if (j >= 0) + pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i); + else + pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i); + return j; +} + +// Test lockdep on SRCU-based deadlock scenarios. +static void rcu_torture_init_srcu_lockdep(void) +{ + int cyclelen; + int deadlock; + bool err = false; + int i; + int j; + int idx; + struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4, + &mut5, &mut6, &mut7, &mut8, &mut9 }; + struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4, + &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 }; + struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4, + &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 }; + int testtype; + + if (!test_srcu_lockdep) + return; + + deadlock = test_srcu_lockdep / 1000; + testtype = (test_srcu_lockdep / 10) % 100; + cyclelen = test_srcu_lockdep % 10; + WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus)); + if (WARN_ONCE(deadlock != !!deadlock, + "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n", + __func__, test_srcu_lockdep, deadlock)) + err = true; + if (WARN_ONCE(cyclelen <= 0, + "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n", + __func__, test_srcu_lockdep, cyclelen)) + err = true; + if (err) + goto err_out; + + if (testtype == 0) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? 
"" : "non-"); + if (deadlock && cyclelen == 1) + pr_info("%s: Expect hang.\n", __func__); + for (i = 0; i < cyclelen; i++) { + j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu", + "srcu_read_unlock", i, cyclelen, deadlock); + idx = srcu_read_lock(srcus[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + srcu_read_unlock(srcus[i], idx); + } + return; + } + + if (testtype == 1) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + for (i = 0; i < cyclelen; i++) { + pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n", + __func__, i, i, i, i); + idx = srcu_read_lock(srcus[i]); + mutex_lock(muts[i]); + mutex_unlock(muts[i]); + srcu_read_unlock(srcus[i], idx); + + j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu", + "mutex_unlock", i, cyclelen, deadlock); + mutex_lock(muts[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + mutex_unlock(muts[i]); + } + return; + } + + if (testtype == 2) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + for (i = 0; i < cyclelen; i++) { + pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n", + __func__, i, i, i, i); + idx = srcu_read_lock(srcus[i]); + down_read(rwsems[i]); + up_read(rwsems[i]); + srcu_read_unlock(srcus[i], idx); + + j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu", + "up_write", i, cyclelen, deadlock); + down_write(rwsems[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + up_write(rwsems[i]); + } + return; + } + +err_out: + pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep); + pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__); + pr_info("%s: D: Deadlock if nonzero.\n", __func__); + pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem.\n", __func__); + pr_info("%s: L: Cycle length.\n", __func__); +} + static int __init rcu_torture_init(void) { @@ -3504,6 +3653,8 @@ rcu_torture_init(void) if (cur_ops->init) cur_ops->init(); + rcu_torture_init_srcu_lockdep(); + if (nreaders >= 0) { nrealreaders = nreaders; } else { From 6e2044887b1d8f26a8cf0042b9a340411b227e5e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Jan 2023 09:43:15 -0800 Subject: [PATCH 16/54] rcutorture: Add RCU Tasks Trace and SRCU deadlock scenarios Add a test number 3 that creates deadlock cycles involving one RCU Tasks Trace step and L-1 SRCU steps. Please note that lockdep will not detect these deadlocks until synchronize_rcu_tasks_trace() is marked with lockdep's new "sync" annotation, which will probably not happen until some time after these markings prove their worth on SRCU. Please note that these tests are available only in kernels built with CONFIG_TASKS_TRACE_RCU=y. Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 80ff9a743d31..9efb8258a272 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -3604,12 +3604,46 @@ static void rcu_torture_init_srcu_lockdep(void) return; } +#ifdef CONFIG_TASKS_TRACE_RCU + if (testtype == 3) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? 
"" : "non-"); + if (deadlock && cyclelen == 1) + pr_info("%s: Expect hang.\n", __func__); + for (i = 0; i < cyclelen; i++) { + char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock"; + char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace" + : "synchronize_srcu"; + char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock"; + + j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock); + if (i == 0) + rcu_read_lock_trace(); + else + idx = srcu_read_lock(srcus[i]); + if (j >= 0) { + if (i == cyclelen - 1) + synchronize_rcu_tasks_trace(); + else + synchronize_srcu(srcus[j]); + } + if (i == 0) + rcu_read_unlock_trace(); + else + srcu_read_unlock(srcus[i], idx); + } + return; + } +#endif // #ifdef CONFIG_TASKS_TRACE_RCU + err_out: pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep); pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__); pr_info("%s: D: Deadlock if nonzero.\n", __func__); - pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem.\n", __func__); + pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__); pr_info("%s: L: Cycle length.\n", __func__); + if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU)) + pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__); } static int __init From 5c5552d6297a2a91464180d2790dffdca9ffc7de Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Jan 2023 20:09:56 -0800 Subject: [PATCH 17/54] rcutorture: Add srcu_lockdep.sh This commit adds an srcu_lockdep.sh script that checks whether lockdep correctly classifies SRCU-based, SRCU/mutex-based, and SRCU/rwsem-based deadlocks. Signed-off-by: Paul E. McKenney [ boqun: Fix "RCUTORTURE" with "$RCUTORTURE" ] Acked-by: Peter Zijlstra (Intel) Signed-off-by: Boqun Feng --- .../selftests/rcutorture/bin/srcu_lockdep.sh | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100755 tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh diff --git a/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh b/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh new file mode 100755 index 000000000000..2e63ef009d59 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Run SRCU-lockdep tests and report any that fail to meet expectations. +# +# Copyright (C) 2021 Meta Platforms, Inc. +# +# Authors: Paul E. McKenney + +usage () { + echo "Usage: $scriptname optional arguments:" + echo " --datestamp string" + exit 1 +} + +ds=`date +%Y.%m.%d-%H.%M.%S`-srcu_lockdep +scriptname="$0" + +T="`mktemp -d ${TMPDIR-/tmp}/srcu_lockdep.sh.XXXXXX`" +trap 'rm -rf $T' 0 + +RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE +PATH=${RCUTORTURE}/bin:$PATH; export PATH +. functions.sh + +while test $# -gt 0 +do + case "$1" in + --datestamp) + checkarg --datestamp "(relative pathname)" "$#" "$2" '^[a-zA-Z0-9._/-]*$' '^--' + ds=$2 + shift + ;; + *) + echo Unknown argument $1 + usage + ;; + esac + shift +done + +err= +nerrs=0 +for d in 0 1 +do + for t in 0 1 2 + do + for c in 1 2 3 + do + err= + val=$((d*1000+t*10+c)) + tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 5s --configs "SRCU-P" --bootargs "rcutorture.test_srcu_lockdep=$val" --trust-make --datestamp "$ds/$val" > "$T/kvm.sh.out" 2>&1 + ret=$? 
+ mv "$T/kvm.sh.out" "$RCUTORTURE/res/$ds/$val" + if test "$d" -ne 0 && test "$ret" -eq 0 + then + err=1 + echo -n Unexpected success for > "$RCUTORTURE/res/$ds/$val/kvm.sh.err" + fi + if test "$d" -eq 0 && test "$ret" -ne 0 + then + err=1 + echo -n Unexpected failure for > "$RCUTORTURE/res/$ds/$val/kvm.sh.err" + fi + if test -n "$err" + then + grep "rcu_torture_init_srcu_lockdep: test_srcu_lockdep = " "$RCUTORTURE/res/$ds/$val/SRCU-P/console.log" | sed -e 's/^.*rcu_torture_init_srcu_lockdep://' >> "$RCUTORTURE/res/$ds/$val/kvm.sh.err" + cat "$RCUTORTURE/res/$ds/$val/kvm.sh.err" + nerrs=$((nerrs+1)) + fi + done + done +done +if test "$nerrs" -ne 0 +then + exit 1 +fi +exit 0 From 2b4be54830681cc023877d3b33a9b7ae9122e551 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Mar 2023 17:29:20 -0700 Subject: [PATCH 18/54] rcu-tasks: Fix warning for unused tasks_rcu_exit_srcu The tasks_rcu_exit_srcu variable is used only by kernels built with CONFIG_TASKS_RCU=y, but is defined for all kernesl with CONFIG_TASKS_RCU_GENERIC=y. Therefore, in kernels built with CONFIG_TASKS_RCU_GENERIC=y but CONFIG_TASKS_RCU=n, this gives a "defined but not used" warning. This commit therefore moves this variable under CONFIG_TASKS_RCU. Link: https://lore.kernel.org/oe-kbuild-all/202303191536.XzMSyzTl-lkp@intel.com/ Reported-by: kernel test robot Reviewed-by: Frederic Weisbecker Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index bfb5e1549f2b..185358c2f08d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -136,8 +136,10 @@ static struct rcu_tasks rt_name = \ .kname = #rt_name, \ } +#ifdef CONFIG_TASKS_RCU /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); +#endif /* Avoid IPIing CPUs early in the grace period. */ #define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) From 3e67cb8a3c6251c86e5d058d8ee4e1909bc25af0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 08:06:56 -0700 Subject: [PATCH 19/54] srcu: Add whitespace to __SRCU_STRUCT_INIT() & __DEFINE_SRCU() This is a whitespace-only commit with no change in functionality. Its purpose is to prepare for later commits that: (1) Cause statically allocated srcu_struct structures to rely on compile-time initialization and (2) Move fields from the srcu_struct structure to a new srcu_usage structure. Cc: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- include/linux/srcutree.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 558057b517b7..488d0e5d1ba3 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -108,13 +108,13 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -#define __SRCU_STRUCT_INIT(name, pcpu_name) \ -{ \ - .sda = &pcpu_name, \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ - .srcu_gp_seq_needed = -1UL, \ - .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ - __SRCU_DEP_MAP_INIT(name) \ +#define __SRCU_STRUCT_INIT(name, pcpu_name) \ +{ \ + .sda = &pcpu_name, \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .srcu_gp_seq_needed = -1UL, \ + .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ + __SRCU_DEP_MAP_INIT(name) \ } /* @@ -137,15 +137,15 @@ struct srcu_struct { * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #ifdef MODULE -# define __DEFINE_SRCU(name, is_static) \ - is_static struct srcu_struct name; \ - extern struct srcu_struct * const __srcu_struct_##name; \ - struct srcu_struct * const __srcu_struct_##name \ +# define __DEFINE_SRCU(name, is_static) \ + is_static struct srcu_struct name; \ + extern struct srcu_struct * const __srcu_struct_##name; \ + struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else -# define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ - is_static struct srcu_struct name = \ +# define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ + is_static struct srcu_struct name = \ __SRCU_STRUCT_INIT(name, name##_srcu_data) #endif #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) From f4d01a259374ef358cd6b00a96b4dfc0fb05a844 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 13:28:04 -0700 Subject: [PATCH 20/54] srcu: Use static init for statically allocated in-module srcu_struct Further shrinking the srcu_struct structure is eased by requiring that in-module srcu_struct structures rely more heavily on static initialization. In particular, this preserves the property that a module-load-time srcu_struct initialization can fail only due to memory-allocation failure of the per-CPU srcu_data structures. It might also slightly improve robustness by keeping the number of memory allocations that must succeed down percpu_alloc() call. This is in preparation for splitting an srcu_usage structure out of the srcu_struct structure. [ paulmck: Fold in qiang1.zhang@intel.com feedback. ] Cc: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- include/linux/srcutree.h | 19 ++++++++++++++----- kernel/rcu/srcutree.c | 19 +++++++++++++------ 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 488d0e5d1ba3..3ce6deee1dbe 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -108,15 +108,24 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -#define __SRCU_STRUCT_INIT(name, pcpu_name) \ -{ \ - .sda = &pcpu_name, \ +#define __SRCU_STRUCT_INIT_COMMON(name) \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq_needed = -1UL, \ .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ - __SRCU_DEP_MAP_INIT(name) \ + __SRCU_DEP_MAP_INIT(name) + +#define __SRCU_STRUCT_INIT_MODULE(name) \ +{ \ + __SRCU_STRUCT_INIT_COMMON(name) \ } +#define __SRCU_STRUCT_INIT(name, pcpu_name) \ +{ \ + .sda = &pcpu_name, \ + __SRCU_STRUCT_INIT_COMMON(name) \ +} + + /* * Define and initialize a srcu struct at build time. * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. @@ -138,7 +147,7 @@ struct srcu_struct { */ #ifdef MODULE # define __DEFINE_SRCU(name, is_static) \ - is_static struct srcu_struct name; \ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name); \ extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ab4ee58af84b..7e6e7dfb1a87 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1873,13 +1873,14 @@ void __init srcu_init(void) static int srcu_module_coming(struct module *mod) { int i; + struct srcu_struct *ssp; struct srcu_struct **sspp = mod->srcu_struct_ptrs; - int ret; for (i = 0; i < mod->num_srcu_structs; i++) { - ret = init_srcu_struct(*(sspp++)); - if (WARN_ON_ONCE(ret)) - return ret; + ssp = *(sspp++); + ssp->sda = alloc_percpu(struct srcu_data); + if (WARN_ON_ONCE(!ssp->sda)) + return -ENOMEM; } return 0; } @@ -1888,10 +1889,16 @@ static int srcu_module_coming(struct module *mod) static void srcu_module_going(struct module *mod) { int i; + struct srcu_struct *ssp; struct srcu_struct **sspp = mod->srcu_struct_ptrs; - for (i = 0; i < mod->num_srcu_structs; i++) - cleanup_srcu_struct(*(sspp++)); + for (i = 0; i < mod->num_srcu_structs; i++) { + ssp = *(sspp++); + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed)) && + !WARN_ON_ONCE(!ssp->sda_is_static)) + cleanup_srcu_struct(ssp); + free_percpu(ssp->sda); + } } /* Handle one module, either coming or going. */ From 95433f7263011e0e6e83caef85d98896dd99cab7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 16 Mar 2023 17:58:51 -0700 Subject: [PATCH 21/54] srcu: Begin offloading srcu_struct fields to srcu_update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current srcu_struct structure is on the order of 200 bytes in size (depending on architecture and .config), which is much better than the old-style 26K bytes, but still all too inconvenient when one is trying to achieve good cache locality on a fastpath involving SRCU readers. However, only a few fields in srcu_struct are used by SRCU readers. The remaining fields could be offloaded to a new srcu_update structure, thus shrinking the srcu_struct structure down to a few tens of bytes. This commit begins this noble quest, a quest that is complicated by open-coded initialization of the srcu_struct within the srcu_notifier_head structure. 
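The intended split can be sketched as follows (a conceptual outline only, not the authoritative layout, which lives in include/linux/srcutree.h and continues to evolve over the next several commits):

	/* Update-side data, cache-cold, reached through one extra pointer. */
	struct srcu_usage {
		struct srcu_node *node;	/* Combining tree (moved by this commit). */
		/* Later commits migrate further update-side fields here. */
	};

	/* Reader-hot data, kept small for cache locality on SRCU fastpaths. */
	struct srcu_struct {
		/* ->sda and the other read-side fields stay here... */
		struct srcu_usage *srcu_sup;	/* ...plus the update-side data. */
	};

Any open-coded initializer, such as the one in srcu_notifier_head, must now name the srcu_usage instance explicitly.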
This complication is addressed by updating the srcu_notifier_head structure's open coding, given that there does not appear to be a straightforward way of abstracting that initialization. This commit moves only the ->node pointer to srcu_update. Later commits will move additional fields. [ paulmck: Fold in qiang1.zhang@intel.com's memory-leak fix. ] Link: https://lore.kernel.org/all/20230320055751.4120251-1-qiang1.zhang@intel.com/ Suggested-by: Christoph Hellwig Cc: "Rafael J. Wysocki" Cc: "Michał Mirosław" Cc: Dmitry Osipenko Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Acked-by: Rafael J. Wysocki Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/notifier.h | 5 ++++- include/linux/srcutiny.h | 6 +++--- include/linux/srcutree.h | 27 ++++++++++++++++++--------- kernel/rcu/rcu.h | 6 ++++-- kernel/rcu/srcutree.c | 28 +++++++++++++++++++--------- 5 files changed, 48 insertions(+), 24 deletions(-) diff --git a/include/linux/notifier.h b/include/linux/notifier.h index aef88c2d1173..2aba75145144 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -73,6 +73,9 @@ struct raw_notifier_head { struct srcu_notifier_head { struct mutex mutex; +#ifdef CONFIG_TREE_SRCU + struct srcu_usage srcuu; +#endif struct srcu_struct srcu; struct notifier_block __rcu *head; }; @@ -107,7 +110,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ - .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ } #define ATOMIC_NOTIFIER_HEAD(name) \ diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 5aa5e0faf6a1..ebd72491af99 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -31,7 +31,7 @@ struct srcu_struct { void srcu_drive_gp(struct work_struct *wp); -#define __SRCU_STRUCT_INIT(name, __ignored) \ +#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored) \ { \ .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ .srcu_cb_tail = &name.srcu_cb_head, \ @@ -44,9 +44,9 @@ void srcu_drive_gp(struct work_struct *wp); * Tree SRCU, which needs some per-CPU data. */ #define DEFINE_SRCU(name) \ - struct srcu_struct name = __SRCU_STRUCT_INIT(name, name) + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) #define DEFINE_STATIC_SRCU(name) \ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name) + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) void synchronize_srcu(struct srcu_struct *ssp); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 3ce6deee1dbe..276f325f1296 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -57,11 +57,17 @@ struct srcu_node { int grphi; /* Biggest CPU for node. */ }; +/* + * Per-SRCU-domain structure, update-side data linked from srcu_struct. + */ +struct srcu_usage { + struct srcu_node *node; /* Combining tree. */ +}; + /* * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - struct srcu_node *node; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ int srcu_size_state; /* Small-to-big transition state. */ @@ -90,6 +96,7 @@ struct srcu_struct { unsigned long reschedule_count; struct delayed_work work; struct lockdep_map dep_map; + struct srcu_usage *srcu_sup; /* Update-side data. */ }; /* Values for size state variable (->srcu_size_state). 
*/ @@ -108,24 +115,24 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -#define __SRCU_STRUCT_INIT_COMMON(name) \ +#define __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq_needed = -1UL, \ .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ + .srcu_sup = &usage_name, \ __SRCU_DEP_MAP_INIT(name) -#define __SRCU_STRUCT_INIT_MODULE(name) \ +#define __SRCU_STRUCT_INIT_MODULE(name, usage_name) \ { \ - __SRCU_STRUCT_INIT_COMMON(name) \ + __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ } -#define __SRCU_STRUCT_INIT(name, pcpu_name) \ +#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ { \ .sda = &pcpu_name, \ - __SRCU_STRUCT_INIT_COMMON(name) \ + __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ } - /* * Define and initialize a srcu struct at build time. * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. @@ -147,15 +154,17 @@ struct srcu_struct { */ #ifdef MODULE # define __DEFINE_SRCU(name, is_static) \ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name); \ + static struct srcu_usage name##_srcu_usage; \ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage); \ extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else # define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ + static struct srcu_usage name##_srcu_usage; \ is_static struct srcu_struct name = \ - __SRCU_STRUCT_INIT(name, name##_srcu_data) + __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data) #endif #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 115616ac3bfa..8d18d4bf0e29 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -341,11 +341,13 @@ extern void rcu_init_geometry(void); * specified state structure (for SRCU) or the only rcu_state structure * (for RCU). */ -#define srcu_for_each_node_breadth_first(sp, rnp) \ +#define _rcu_for_each_node_breadth_first(sp, rnp) \ for ((rnp) = &(sp)->node[0]; \ (rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++) #define rcu_for_each_node_breadth_first(rnp) \ - srcu_for_each_node_breadth_first(&rcu_state, rnp) + _rcu_for_each_node_breadth_first(&rcu_state, rnp) +#define srcu_for_each_node_breadth_first(ssp, rnp) \ + _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp) /* * Scan the leaves of the rcu_node hierarchy for the rcu_state structure. diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 7e6e7dfb1a87..049e20dbec76 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -173,12 +173,12 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); - ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags); - if (!ssp->node) + ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags); + if (!ssp->srcu_sup->node) return false; /* Work out the overall tree geometry. 
*/ - ssp->level[0] = &ssp->node[0]; + ssp->level[0] = &ssp->srcu_sup->node[0]; for (i = 1; i < rcu_num_lvls; i++) ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); @@ -195,7 +195,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; - if (snp == &ssp->node[0]) { + if (snp == &ssp->srcu_sup->node[0]) { /* Root node, special case. */ snp->srcu_parent = NULL; continue; @@ -236,8 +236,12 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) */ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { + if (!is_static) + ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL); + if (!ssp->srcu_sup) + return -ENOMEM; ssp->srcu_size_state = SRCU_SIZE_SMALL; - ssp->node = NULL; + ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); ssp->srcu_idx = 0; @@ -249,8 +253,11 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); - if (!ssp->sda) + if (!ssp->sda) { + if (!is_static) + kfree(ssp->srcu_sup); return -ENOMEM; + } init_srcu_struct_data(ssp); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); @@ -259,6 +266,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; + kfree(ssp->srcu_sup); return -ENOMEM; } } else { @@ -656,13 +664,15 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } + kfree(ssp->srcu_sup->node); + ssp->srcu_sup->node = NULL; + ssp->srcu_size_state = SRCU_SIZE_SMALL; if (!ssp->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; + kfree(ssp->srcu_sup); + ssp->srcu_sup = NULL; } - kfree(ssp->node); - ssp->node = NULL; - ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); From 208f41b1312443401353bec0c1939e2bfc28adce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 14:43:08 -0700 Subject: [PATCH 22/54] srcu: Move ->level from srcu_struct to srcu_usage This commit moves the ->level[] array from the srcu_struct structure to the srcu_usage structure to reduce the size of the former in order to improve cache locality. Suggested-by: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 4 ++-- kernel/rcu/srcutree.c | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 276f325f1296..c7373fe5c14b 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -62,14 +62,14 @@ struct srcu_node { */ struct srcu_usage { struct srcu_node *node; /* Combining tree. */ + struct srcu_node *level[RCU_NUM_LVLS + 1]; + /* First node at each level. */ }; /* * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - struct srcu_node *level[RCU_NUM_LVLS + 1]; - /* First node at each level. */ int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ spinlock_t __private lock; /* Protect counters and size state. 
*/ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 049e20dbec76..acb0862faafa 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -178,9 +178,9 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) return false; /* Work out the overall tree geometry. */ - ssp->level[0] = &ssp->srcu_sup->node[0]; + ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0]; for (i = 1; i < rcu_num_lvls; i++) - ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1]; + ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); /* Each pass through this loop initializes one srcu_node structure. */ @@ -202,10 +202,10 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) } /* Non-root node. */ - if (snp == ssp->level[level + 1]) + if (snp == ssp->srcu_sup->level[level + 1]) level++; - snp->srcu_parent = ssp->level[level - 1] + - (snp - ssp->level[level]) / + snp->srcu_parent = ssp->srcu_sup->level[level - 1] + + (snp - ssp->srcu_sup->level[level]) / levelspread[level - 1]; } @@ -214,7 +214,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) * leaves of the srcu_node tree. */ level = rcu_num_lvls - 1; - snp_first = ssp->level[level]; + snp_first = ssp->srcu_sup->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); sdp->mynode = &snp_first[cpu / levelspread[level]]; @@ -889,7 +889,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; + last_lvl = snp >= ssp->srcu_sup->level[rcu_num_lvls - 1]; if (last_lvl) cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; From a0d8cbd3821369dc9478cabd605417afb9eb24dc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 17:16:30 -0700 Subject: [PATCH 23/54] srcu: Move ->srcu_size_state from srcu_struct to srcu_usage This commit moves the ->srcu_size_state field from the srcu_struct structure to the srcu_usage structure to reduce the size of the former in order to improve cache locality. Suggested-by: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 +- kernel/rcu/srcutree.c | 37 +++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index c7373fe5c14b..443d27a214ef 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -64,13 +64,13 @@ struct srcu_usage { struct srcu_node *node; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ + int srcu_size_state; /* Small-to-big transition state. */ }; /* * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. 
*/ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index acb0862faafa..8428a184d506 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -225,7 +225,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) } sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } - smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); return true; } @@ -240,7 +240,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL); if (!ssp->srcu_sup) return -ENOMEM; - ssp->srcu_size_state = SRCU_SIZE_SMALL; + ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); @@ -261,7 +261,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) init_srcu_struct_data(ssp); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { + if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { if (!ssp->sda_is_static) { free_percpu(ssp->sda); @@ -270,7 +270,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) return -ENOMEM; } } else { - WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG); + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } } smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ @@ -315,7 +315,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); static void __srcu_transition_to_big(struct srcu_struct *ssp) { lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); - smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); + smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC); } /* @@ -326,10 +326,10 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) unsigned long flags; /* Double-checked locking on ->srcu_size-state. */ - if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) return; spin_lock_irqsave_rcu_node(ssp, flags); - if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) { + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) { spin_unlock_irqrestore_rcu_node(ssp, flags); return; } @@ -345,7 +345,7 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) { unsigned long j; - if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state) return; j = jiffies; if (ssp->srcu_size_jiffies != j) { @@ -666,7 +666,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) } kfree(ssp->srcu_sup->node); ssp->srcu_sup->node = NULL; - ssp->srcu_size_state = SRCU_SIZE_SMALL; + ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; if (!ssp->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; @@ -770,7 +770,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) struct srcu_data *sdp; int state; - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = this_cpu_ptr(ssp->sda); @@ -880,7 +880,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* A new grace period can start at this point. But only one. 
*/ /* Initiate callback invocation as needed. */ - ss_state = smp_load_acquire(&ssp->srcu_size_state); + ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_BARRIER) { srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()), cbdelay); @@ -940,7 +940,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (ss_state == SRCU_SIZE_ALLOC) init_srcu_struct_nodes(ssp, GFP_KERNEL); else - smp_store_release(&ssp->srcu_size_state, ss_state + 1); + smp_store_release(&ssp->srcu_sup->srcu_size_state, ss_state + 1); } } @@ -1002,7 +1002,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, unsigned long snp_seq; /* Ensure that snp node tree is fully initialized before traversing it */ - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) snp_leaf = NULL; else snp_leaf = sdp->mynode; @@ -1209,7 +1209,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, * sequence number cannot wrap around in the meantime. */ idx = __srcu_read_lock_nmisafe(ssp); - ss_state = smp_load_acquire(&ssp->srcu_size_state); + ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_CALL) sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else @@ -1546,7 +1546,7 @@ void srcu_barrier(struct srcu_struct *ssp) atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); idx = __srcu_read_lock_nmisafe(ssp); - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id())); else for_each_possible_cpu(cpu) @@ -1784,7 +1784,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int cpu; int idx; unsigned long s0 = 0, s1 = 0; - int ss_state = READ_ONCE(ssp->srcu_size_state); + int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; idx = ssp->srcu_idx & 0x1; @@ -1871,8 +1871,9 @@ void __init srcu_init(void) ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); list_del_init(&ssp->work.work.entry); - if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) - ssp->srcu_size_state = SRCU_SIZE_ALLOC; + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && + ssp->srcu_sup->srcu_size_state == SRCU_SIZE_SMALL) + ssp->srcu_sup->srcu_size_state = SRCU_SIZE_ALLOC; queue_work(rcu_gp_wq, &ssp->work.work); } } From 574dc1a7efe490dffe5c1ce0285306feec16a880 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 17:22:27 -0700 Subject: [PATCH 24/54] srcu: Move ->srcu_cb_mutex from srcu_struct to srcu_usage This commit moves the ->srcu_cb_mutex field from the srcu_struct structure to the srcu_usage structure to reduce the size of the former in order to improve cache locality. Suggested-by: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 +- kernel/rcu/srcutree.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 443d27a214ef..231de66ceb15 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -65,13 +65,13 @@ struct srcu_usage { struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ int srcu_size_state; /* Small-to-big transition state. 
*/ + struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ }; /* * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned int srcu_idx; /* Current rdr array element. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 8428a184d506..1814f3bfc219 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -242,7 +242,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) return -ENOMEM; ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; ssp->srcu_sup->node = NULL; - mutex_init(&ssp->srcu_cb_mutex); + mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); ssp->srcu_idx = 0; ssp->srcu_gp_seq = 0; @@ -861,7 +861,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) int ss_state; /* Prevent more than one additional grace period. */ - mutex_lock(&ssp->srcu_cb_mutex); + mutex_lock(&ssp->srcu_sup->srcu_cb_mutex); /* End the current grace period. */ spin_lock_irq_rcu_node(ssp); @@ -921,7 +921,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) } /* Callback initiation done, allow grace periods after next. */ - mutex_unlock(&ssp->srcu_cb_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_cb_mutex); /* Start a new grace period if needed. */ spin_lock_irq_rcu_node(ssp); From 0839ade94bdef395bab02b3a579416c112062026 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 17:35:21 -0700 Subject: [PATCH 25/54] srcu: Move ->lock initialization after srcu_usage allocation Currently, both __init_srcu_struct() in CONFIG_DEBUG_LOCK_ALLOC=y kernels and init_srcu_struct() in CONFIG_DEBUG_LOCK_ALLOC=n kernels initialize the srcu_struct structure's ->lock before the srcu_usage structure has been allocated. This of course prevents the ->lock from being moved to the srcu_usage structure, so this commit moves the initialization into init_srcu_struct_fields() after the srcu_usage structure has been allocated. Cc: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 1814f3bfc219..c2a024a60f1a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -240,6 +240,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL); if (!ssp->srcu_sup) return -ENOMEM; + if (!is_static) + spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); @@ -285,7 +287,6 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) /* Don't re-initialize a lock while it is held.
*/ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); lockdep_init_map(&ssp->dep_map, name, key, 0); - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -302,7 +303,6 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *ssp) { - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); From b3fb11f7e9c3c64dd86403409a070c996d8ac081 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 18:29:38 -0700 Subject: [PATCH 26/54] srcu: Move ->lock from srcu_struct to srcu_usage This commit moves the ->lock field from the srcu_struct structure to the srcu_usage structure to reduce the size of the former in order to improve cache locality. Suggested-by: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 11 +++++--- kernel/rcu/srcutree.c | 56 ++++++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 231de66ceb15..694d87b81917 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -66,13 +66,13 @@ struct srcu_usage { /* First node at each level. */ int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ + spinlock_t __private lock; /* Protect counters and size state. */ }; /* * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. 
*/ @@ -116,7 +116,6 @@ struct srcu_struct { #define SRCU_STATE_SCAN2 2 #define __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq_needed = -1UL, \ .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ .srcu_sup = &usage_name, \ @@ -154,7 +153,9 @@ struct srcu_struct { */ #ifdef MODULE # define __DEFINE_SRCU(name, is_static) \ - static struct srcu_usage name##_srcu_usage; \ + static struct srcu_usage name##_srcu_usage = { \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + }; \ is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage); \ extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ @@ -162,7 +163,9 @@ struct srcu_struct { #else # define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ - static struct srcu_usage name##_srcu_usage; \ + static struct srcu_usage name##_srcu_usage = { \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + }; \ is_static struct srcu_struct name = \ __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data) #endif diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c2a024a60f1a..c42248cf18f6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -103,7 +103,7 @@ do { \ #define spin_trylock_irqsave_rcu_node(p, flags) \ ({ \ - bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ \ if (___locked) \ smp_mb__after_unlock_lock(); \ @@ -241,7 +241,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->srcu_sup) return -ENOMEM; if (!is_static) - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); + spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); @@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); */ static void __srcu_transition_to_big(struct srcu_struct *ssp) { - lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC); } @@ -328,13 +328,13 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) /* Double-checked locking on ->srcu_size-state. 
*/ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) return; - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) { - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } __srcu_transition_to_big(ssp); - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -369,9 +369,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon if (spin_trylock_irqsave_rcu_node(sdp, *flags)) return; - spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_check_contention(ssp); - spin_unlock_irqrestore_rcu_node(ssp, *flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_rcu_node(sdp, *flags); } @@ -383,9 +383,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon */ static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) { - if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) return; - spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_check_contention(ssp); } @@ -404,13 +404,13 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } init_srcu_struct_fields(ssp, true); - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -774,7 +774,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = this_cpu_ptr(ssp->sda); - lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, @@ -864,7 +864,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) mutex_lock(&ssp->srcu_sup->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) @@ -875,7 +875,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); mutex_unlock(&ssp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -924,15 +924,15 @@ static void srcu_gp_end(struct srcu_struct *ssp) mutex_unlock(&ssp->srcu_sup->srcu_cb_mutex); /* Start a new grace period if needed. 
*/ - spin_lock_irq_rcu_node(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) { srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); } /* Transition to big if needed. */ @@ -975,7 +975,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -1064,7 +1064,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, else if (list_empty(&ssp->work.work.entry)) list_add(&ssp->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -1599,17 +1599,17 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); mutex_unlock(&ssp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&ssp->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1623,10 +1623,10 @@ static void srcu_advance_state(struct srcu_struct *ssp) return; /* readers present, retry later. */ } srcu_flip(ssp); - spin_lock_irq_rcu_node(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); ssp->srcu_n_exp_nodelay = 0; - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); } if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { @@ -1710,7 +1710,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1720,7 +1720,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (pushgp) queue_delayed_work(rcu_gp_wq, &ssp->work, delay); From e3a6ab25cfa0fcdcb31c346b9871a566d440980d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 19:13:16 -0700 Subject: [PATCH 27/54] srcu: Move ->srcu_gp_mutex from srcu_struct to srcu_usage This commit moves the ->srcu_gp_mutex field from the srcu_struct structure to the srcu_usage structure to reduce the size of the former in order to improve cache locality. Suggested-by: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- include/linux/srcutree.h | 2 +- kernel/rcu/srcutree.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 694d87b81917..d04e3da6181c 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -67,13 +67,13 @@ struct srcu_usage { int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ spinlock_t __private lock; /* Protect counters and size state. */ + struct mutex srcu_gp_mutex; /* Serialize GP work. */ }; /* * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c42248cf18f6..a36066798de7 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -245,7 +245,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); - mutex_init(&ssp->srcu_gp_mutex); + mutex_init(&ssp->srcu_sup->srcu_gp_mutex); ssp->srcu_idx = 0; ssp->srcu_gp_seq = 0; ssp->srcu_barrier_seq = 0; @@ -876,7 +876,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); spin_unlock_irq_rcu_node(ssp->srcu_sup); - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ @@ -1585,7 +1585,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) { int idx; - mutex_lock(&ssp->srcu_gp_mutex); + mutex_lock(&ssp->srcu_sup->srcu_gp_mutex); /* * Because readers might be delayed for an extended period after @@ -1603,7 +1603,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); spin_unlock_irq_rcu_node(ssp->srcu_sup); - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); @@ -1611,7 +1611,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) srcu_gp_start(ssp); spin_unlock_irq_rcu_node(ssp->srcu_sup); if (idx != SRCU_STATE_IDLE) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* Someone else started the grace period. */ } } @@ -1619,7 +1619,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { idx = 1 ^ (ssp->srcu_idx & 1); if (!try_check_zero(ssp, idx, 1)) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } srcu_flip(ssp); @@ -1637,7 +1637,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ idx = 1 ^ (ssp->srcu_idx & 1); if (!try_check_zero(ssp, idx, 2)) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } ssp->srcu_n_exp_nodelay = 0; From 03200b5ca3b4d4edf634dc052bf3b8eb8dc8bbbc Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 17 Mar 2023 19:30:50 -0700 Subject: [PATCH 28/54] srcu: Move grace-period fields from srcu_struct to srcu_usage This commit moves the ->srcu_gp_seq, ->srcu_gp_seq_needed, ->srcu_gp_seq_needed_exp, ->srcu_gp_start, and ->srcu_last_gp_end fields from the srcu_struct structure to the srcu_usage structure to reduce the size of the former in order to improve cache locality. Suggested-by: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 25 ++++---- kernel/rcu/srcutree.c | 128 +++++++++++++++++++-------------------- 2 files changed, 77 insertions(+), 76 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index d04e3da6181c..372e35b0e8b6 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -68,6 +68,11 @@ struct srcu_usage { struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ + unsigned long srcu_gp_seq; /* Grace-period seq #. */ + unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + unsigned long srcu_gp_start; /* Last GP start timestamp (jiffies) */ + unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ }; /* @@ -75,11 +80,6 @@ struct srcu_usage { */ struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ - unsigned long srcu_gp_seq; /* Grace-period seq #. */ - unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ - unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ - unsigned long srcu_gp_start; /* Last GP start timestamp (jiffies) */ - unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. 
*/ @@ -115,8 +115,13 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -#define __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ +#define __SRCU_USAGE_INIT(name) \ +{ \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq_needed = -1UL, \ +} + +#define __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ .srcu_sup = &usage_name, \ __SRCU_DEP_MAP_INIT(name) @@ -153,9 +158,7 @@ struct srcu_struct { */ #ifdef MODULE # define __DEFINE_SRCU(name, is_static) \ - static struct srcu_usage name##_srcu_usage = { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ - }; \ + static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage); \ extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ @@ -163,9 +166,7 @@ struct srcu_struct { #else # define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ - static struct srcu_usage name##_srcu_usage = { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ - }; \ + static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ is_static struct srcu_struct name = \ __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data) #endif diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a36066798de7..340eb685cf64 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -135,8 +135,8 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq; sdp->mynode = NULL; sdp->cpu = cpu; INIT_WORK(&sdp->work, srcu_invoke_callbacks); @@ -247,7 +247,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); ssp->srcu_idx = 0; - ssp->srcu_gp_seq = 0; + ssp->srcu_sup->srcu_gp_seq = 0; ssp->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_barrier_mutex); atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); @@ -261,8 +261,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) return -ENOMEM; } init_srcu_struct_data(ssp); - ssp->srcu_gp_seq_needed_exp = 0; - ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; + ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { if (!ssp->sda_is_static) { @@ -275,7 +275,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } } - smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ + smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ return 0; } @@ -402,10 +402,10 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) unsigned long flags; /* The smp_load_acquire() pairs with the smp_store_release(). 
*/ - if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/ + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); - if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) { + if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) { spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } @@ -616,11 +616,11 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long j; unsigned long jbase = SRCU_INTERVAL; - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_sup->srcu_gp_seq), READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp))) jbase = 0; - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq))) { j = jiffies - 1; - gpstart = READ_ONCE(ssp->srcu_gp_start); + gpstart = READ_ONCE(ssp->srcu_sup->srcu_gp_start); if (time_after(j, gpstart)) jbase += j - gpstart; if (!jbase) { @@ -656,12 +656,12 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ } - if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || - WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || + if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq) != ssp->srcu_sup->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), - rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); + __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)), + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ssp->srcu_sup->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } kfree(ssp->srcu_sup->node); @@ -775,18 +775,18 @@ static void srcu_gp_start(struct srcu_struct *ssp) else sdp = this_cpu_ptr(ssp->sda); lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ - WRITE_ONCE(ssp->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ - rcu_seq_start(&ssp->srcu_gp_seq); - state = rcu_seq_state(ssp->srcu_gp_seq); + rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq); + state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -865,16 +865,16 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* End the current grace period. 
*/ spin_lock_irq_rcu_node(ssp->srcu_sup); - idx = rcu_seq_state(ssp->srcu_gp_seq); + idx = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_sup->srcu_gp_seq), READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp))) cbdelay = 0; - WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); - rcu_seq_end(&ssp->srcu_gp_seq); - gpseq = rcu_seq_current(&ssp->srcu_gp_seq); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); + WRITE_ONCE(ssp->srcu_sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); + rcu_seq_end(&ssp->srcu_sup->srcu_gp_seq); + gpseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); + if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, gpseq)) + WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, gpseq); spin_unlock_irq_rcu_node(ssp->srcu_sup); mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -925,9 +925,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* Start a new grace period if needed. */ spin_lock_irq_rcu_node(ssp->srcu_sup); - gpseq = rcu_seq_current(&ssp->srcu_gp_seq); + gpseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); if (!rcu_seq_state(gpseq) && - ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) { + ULONG_CMP_LT(gpseq, ssp->srcu_sup->srcu_gp_seq_needed)) { srcu_gp_start(ssp); spin_unlock_irq_rcu_node(ssp->srcu_sup); srcu_reschedule(ssp, 0); @@ -960,7 +960,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp if (snp) for (; snp != NULL; snp = snp->srcu_parent) { sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); - if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) || + if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) || (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) return; spin_lock_irqsave_rcu_node(snp, flags); @@ -973,8 +973,8 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp spin_unlock_irqrestore_rcu_node(snp, flags); } spin_lock_irqsave_ssp_contention(ssp, &flags); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); + if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } @@ -1010,7 +1010,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp_leaf) /* Each pass through the loop does one level of the srcu_node tree. */ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { - if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf) + if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; @@ -1037,20 +1037,20 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* Top of tree, must ensure the grace period will be started. */ spin_lock_irqsave_ssp_contention(ssp, &flags); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { + if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load * acquire setting up for initialization. 
*/ - smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ + smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, s); /*^^^*/ } - if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); + if (!do_norm && ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s); /* If grace period not already in progress, start it. */ - if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && - rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) && + rcu_seq_state(ssp->srcu_sup->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)); srcu_gp_start(ssp); // And how can that list_add() in the "else" clause @@ -1164,18 +1164,18 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); - tlast = READ_ONCE(ssp->srcu_last_gp_end); + tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end); if (exp_holdoff == 0 || time_in_range_open(t, tlast, tlast + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ - curseq = rcu_seq_current(&ssp->srcu_gp_seq); + curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ - if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed))) + if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed))) return false; /* Grace period in progress, so not idle. */ smp_mb(); /* Order ->srcu_gp_seq with prior access. */ - if (curseq != rcu_seq_current(&ssp->srcu_gp_seq)) + if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)) return false; /* GP # changed, so not idle. */ return true; /* With reasonable probability, idle! */ } @@ -1218,8 +1218,8 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_gp_seq); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; @@ -1430,7 +1430,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) // Any prior manipulation of SRCU-protected data must happen // before the load from ->srcu_gp_seq. smp_mb(); - return rcu_seq_snap(&ssp->srcu_gp_seq); + return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); @@ -1477,7 +1477,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie)) + if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. @@ -1597,16 +1597,16 @@ static void srcu_advance_state(struct srcu_struct *ssp) * The load-acquire ensures that we see the accesses performed * by the prior grace period. 
*/ - idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */ + idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { spin_lock_irq_rcu_node(ssp->srcu_sup); - if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { - WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); + if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)); spin_unlock_irq_rcu_node(ssp->srcu_sup); mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; } - idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); + idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(ssp); spin_unlock_irq_rcu_node(ssp->srcu_sup); @@ -1616,7 +1616,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) } } - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { idx = 1 ^ (ssp->srcu_idx & 1); if (!try_check_zero(ssp, idx, 1)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); @@ -1624,12 +1624,12 @@ static void srcu_advance_state(struct srcu_struct *ssp) } srcu_flip(ssp); spin_lock_irq_rcu_node(ssp->srcu_sup); - rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2); ssp->srcu_n_exp_nodelay = 0; spin_unlock_irq_rcu_node(ssp->srcu_sup); } - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) { /* * SRCU read-side critical sections are normally short, @@ -1666,7 +1666,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { spin_unlock_irq_rcu_node(sdp); @@ -1694,7 +1694,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); @@ -1711,12 +1711,12 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) bool pushgp = true; spin_lock_irq_rcu_node(ssp->srcu_sup); - if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { - if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) { + if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ pushgp = false; } - } else if (!rcu_seq_state(ssp->srcu_gp_seq)) { + } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) { /* Outstanding request and no GP. Start one. 
*/ srcu_gp_start(ssp); } @@ -1762,7 +1762,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, if (test_type != SRCU_FLAVOR) return; *flags = 0; - *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq); + *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); @@ -1791,7 +1791,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, + tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state, srcu_size_state_name[ss_state_idx]); if (!ssp->sda) { // Called after cleanup_srcu_struct(), perhaps. @@ -1905,7 +1905,7 @@ static void srcu_module_going(struct module *mod) for (i = 0; i < mod->num_srcu_structs; i++) { ssp = *(sspp++); - if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed)) && + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) && !WARN_ON_ONCE(!ssp->sda_is_static)) cleanup_srcu_struct(ssp); free_percpu(ssp->sda); From 3b46679c623c2766f4c56fd3f9ce8edbb38c5d20 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 20:01:02 -0700 Subject: [PATCH 29/54] srcu: Move heuristics fields from srcu_struct to srcu_usage This commit moves the ->srcu_size_jiffies, ->srcu_n_lock_retries, and ->srcu_n_exp_nodelay fields from the srcu_struct structure to the srcu_usage structure to reduce the size of the former in order to improve cache locality. Suggested-by: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 6 +++--- kernel/rcu/srcutree.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 372e35b0e8b6..3023492d8d89 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -73,6 +73,9 @@ struct srcu_usage { unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ unsigned long srcu_gp_start; /* Last GP start timestamp (jiffies) */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ + unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ + unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ + unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. */ }; /* @@ -80,9 +83,6 @@ struct srcu_usage { */ struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ - unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ - unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ - unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. 
*/ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 340eb685cf64..291fb520bce0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -348,11 +348,11 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state) return; j = jiffies; - if (ssp->srcu_size_jiffies != j) { - ssp->srcu_size_jiffies = j; - ssp->srcu_n_lock_retries = 0; + if (ssp->srcu_sup->srcu_size_jiffies != j) { + ssp->srcu_sup->srcu_size_jiffies = j; + ssp->srcu_sup->srcu_n_lock_retries = 0; } - if (++ssp->srcu_n_lock_retries <= small_contention_lim) + if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim) return; __srcu_transition_to_big(ssp); } @@ -624,8 +624,8 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) if (time_after(j, gpstart)) jbase += j - gpstart; if (!jbase) { - WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); - if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) + WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) jbase = 1; } } @@ -783,7 +783,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); - WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); + WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq); state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq); @@ -1625,7 +1625,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) srcu_flip(ssp); spin_lock_irq_rcu_node(ssp->srcu_sup); rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2); - ssp->srcu_n_exp_nodelay = 0; + ssp->srcu_sup->srcu_n_exp_nodelay = 0; spin_unlock_irq_rcu_node(ssp->srcu_sup); } @@ -1640,7 +1640,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } - ssp->srcu_n_exp_nodelay = 0; + ssp->srcu_sup->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } From 660349ac79cb22bb64c44b026d879069783e97d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 20:22:58 -0700 Subject: [PATCH 30/54] srcu: Move ->sda_is_static from srcu_struct to srcu_usage This commit moves the ->sda_is_static field from the srcu_struct structure to the srcu_usage structure to reduce the size of the former in order to improve cache locality. Suggested-by: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 +- kernel/rcu/srcutree.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 3023492d8d89..d3534ecb806e 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -76,6 +76,7 @@ struct srcu_usage { unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. */ + bool sda_is_static; /* May ->sda be passed to free_percpu()? 
*/ }; /* @@ -84,7 +85,6 @@ struct srcu_usage { struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ - bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ struct completion srcu_barrier_completion; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 291fb520bce0..20f2373f7e25 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -252,7 +252,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_barrier_mutex); atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->work, process_srcu); - ssp->sda_is_static = is_static; + ssp->srcu_sup->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); if (!ssp->sda) { @@ -265,7 +265,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { - if (!ssp->sda_is_static) { + if (!ssp->srcu_sup->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; kfree(ssp->srcu_sup); @@ -667,7 +667,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) kfree(ssp->srcu_sup->node); ssp->srcu_sup->node = NULL; ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; - if (!ssp->sda_is_static) { + if (!ssp->srcu_sup->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; kfree(ssp->srcu_sup); @@ -1906,7 +1906,7 @@ static void srcu_module_going(struct module *mod) for (i = 0; i < mod->num_srcu_structs; i++) { ssp = *(sspp++); if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) && - !WARN_ON_ONCE(!ssp->sda_is_static)) + !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static)) cleanup_srcu_struct(ssp); free_percpu(ssp->sda); } From d20162e0bfc222183a7c94cd00e74b6bbf1a605b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 21:08:18 -0700 Subject: [PATCH 31/54] srcu: Move srcu_barrier() fields from srcu_struct to srcu_usage This commit moves the ->srcu_barrier_seq, ->srcu_barrier_mutex, ->srcu_barrier_completion, and ->srcu_barrier_cpu_cnt fields from the srcu_struct structure to the srcu_usage structure to reduce the size of the former in order to improve cache locality. Suggested-by: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 14 +++++++------- kernel/rcu/srcutree.c | 38 +++++++++++++++++++------------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index d3534ecb806e..d544ec1c0c8e 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -77,6 +77,13 @@ struct srcu_usage { unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. */ bool sda_is_static; /* May ->sda be passed to free_percpu()? */ + unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ + struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ + struct completion srcu_barrier_completion; + /* Awaken barrier rq at end. 
*/ + atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ + /* callback for the barrier */ + /* operation. */ }; /* @@ -85,13 +92,6 @@ struct srcu_usage { struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ - unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ - struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ - struct completion srcu_barrier_completion; - /* Awaken barrier rq at end. */ - atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ - /* callback for the barrier */ - /* operation. */ unsigned long reschedule_jiffies; unsigned long reschedule_count; struct delayed_work work; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 20f2373f7e25..97d1fe9a160c 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -248,9 +248,9 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_sup->srcu_gp_mutex); ssp->srcu_idx = 0; ssp->srcu_sup->srcu_gp_seq = 0; - ssp->srcu_barrier_seq = 0; - mutex_init(&ssp->srcu_barrier_mutex); - atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); + ssp->srcu_sup->srcu_barrier_seq = 0; + mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); + atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->work, process_srcu); ssp->srcu_sup->sda_is_static = is_static; if (!is_static) @@ -1496,8 +1496,8 @@ static void srcu_barrier_cb(struct rcu_head *rhp) sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); ssp = sdp->ssp; - if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) - complete(&ssp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_sup->srcu_barrier_completion); } /* @@ -1511,13 +1511,13 @@ static void srcu_barrier_cb(struct rcu_head *rhp) static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) { spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); + atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, &sdp->srcu_barrier_head)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); + atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt); } spin_unlock_irq_rcu_node(sdp); } @@ -1530,20 +1530,20 @@ void srcu_barrier(struct srcu_struct *ssp) { int cpu; int idx; - unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); + unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq); check_init_srcu_struct(ssp); - mutex_lock(&ssp->srcu_barrier_mutex); - if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) { + mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex); + if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) { smp_mb(); /* Force ordering following return. */ - mutex_unlock(&ssp->srcu_barrier_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex); return; /* Someone else did our work for us. */ } - rcu_seq_start(&ssp->srcu_barrier_seq); - init_completion(&ssp->srcu_barrier_completion); + rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq); + init_completion(&ssp->srcu_sup->srcu_barrier_completion); /* Initial count prevents reaching zero until all CBs are posted. 
*/ - atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); + atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1); idx = __srcu_read_lock_nmisafe(ssp); if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) @@ -1554,12 +1554,12 @@ void srcu_barrier(struct srcu_struct *ssp) __srcu_read_unlock_nmisafe(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ - if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) - complete(&ssp->srcu_barrier_completion); - wait_for_completion(&ssp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_sup->srcu_barrier_completion); + wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion); - rcu_seq_end(&ssp->srcu_barrier_seq); - mutex_unlock(&ssp->srcu_barrier_mutex); + rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq); + mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex); } EXPORT_SYMBOL_GPL(srcu_barrier); From fd1b3f8e097b7fbbab8ac4a802b24fc23c703dcf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Mar 2023 21:30:32 -0700 Subject: [PATCH 32/54] srcu: Move work-scheduling fields from srcu_struct to srcu_usage This commit moves the ->reschedule_jiffies, ->reschedule_count, and ->work fields from the srcu_struct structure to the srcu_usage structure to reduce the size of the former in order to improve cache locality. However, this means that the container_of() calls cannot get a pointer to the srcu_struct because they are no longer in the srcu_struct. This issue is addressed by adding a ->srcu_ssp field in the srcu_usage structure that references the corresponding srcu_struct structure. And given the presence of the sup pointer to the srcu_usage structure, replace some ssp->srcu_usage-> instances with sup->. [ paulmck Apply feedback from kernel test robot. ] Link: https://lore.kernel.org/oe-kbuild-all/202303191400.iO5BOqka-lkp@intel.com/ Suggested-by: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 9 +++++---- kernel/rcu/srcutree.c | 41 +++++++++++++++++++++------------------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index d544ec1c0c8e..cd0cdd8142c5 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -84,6 +84,10 @@ struct srcu_usage { atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ /* callback for the barrier */ /* operation. */ + unsigned long reschedule_jiffies; + unsigned long reschedule_count; + struct delayed_work work; + struct srcu_struct *srcu_ssp; }; /* @@ -92,9 +96,6 @@ struct srcu_usage { struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ - unsigned long reschedule_jiffies; - unsigned long reschedule_count; - struct delayed_work work; struct lockdep_map dep_map; struct srcu_usage *srcu_sup; /* Update-side data. 
*/ }; @@ -119,10 +120,10 @@ struct srcu_struct { { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq_needed = -1UL, \ + .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ } #define __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ - .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ .srcu_sup = &usage_name, \ __SRCU_DEP_MAP_INIT(name) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 97d1fe9a160c..169a6513b739 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -251,7 +251,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); - INIT_DELAYED_WORK(&ssp->work, process_srcu); + INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); ssp->srcu_sup->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); @@ -275,6 +275,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } } + ssp->srcu_sup->srcu_ssp = ssp; smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ return 0; } @@ -647,7 +648,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ - flush_delayed_work(&ssp->work); + flush_delayed_work(&ssp->srcu_sup->work); for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); @@ -1059,10 +1060,10 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, // can only be executed during early boot when there is only // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &ssp->work, + queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, !!srcu_get_delay(ssp)); - else if (list_empty(&ssp->work.work.entry)) - list_add(&ssp->work.work.entry, &srcu_boot_list); + else if (list_empty(&ssp->srcu_sup->work.work.entry)) + list_add(&ssp->srcu_sup->work.work.entry, &srcu_boot_list); } spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } @@ -1723,7 +1724,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) spin_unlock_irq_rcu_node(ssp->srcu_sup); if (pushgp) - queue_delayed_work(rcu_gp_wq, &ssp->work, delay); + queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay); } /* @@ -1734,22 +1735,24 @@ static void process_srcu(struct work_struct *work) unsigned long curdelay; unsigned long j; struct srcu_struct *ssp; + struct srcu_usage *sup; - ssp = container_of(work, struct srcu_struct, work.work); + sup = container_of(work, struct srcu_usage, work.work); + ssp = sup->srcu_ssp; srcu_advance_state(ssp); curdelay = srcu_get_delay(ssp); if (curdelay) { - WRITE_ONCE(ssp->reschedule_count, 0); + WRITE_ONCE(sup->reschedule_count, 0); } else { j = jiffies; - if (READ_ONCE(ssp->reschedule_jiffies) == j) { - WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); - if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay) + if (READ_ONCE(sup->reschedule_jiffies) == j) { + WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1); + if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay) curdelay = 1; } else { - WRITE_ONCE(ssp->reschedule_count, 1); - WRITE_ONCE(ssp->reschedule_jiffies, j); + WRITE_ONCE(sup->reschedule_count, 1); + WRITE_ONCE(sup->reschedule_jiffies, j); } } srcu_reschedule(ssp, curdelay); @@ -1848,7 
+1851,7 @@ early_initcall(srcu_bootup_announce); void __init srcu_init(void) { - struct srcu_struct *ssp; + struct srcu_usage *sup; /* Decide on srcu_struct-size strategy. */ if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { @@ -1868,13 +1871,13 @@ void __init srcu_init(void) */ srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, + sup = list_first_entry(&srcu_boot_list, struct srcu_usage, work.work.entry); - list_del_init(&ssp->work.work.entry); + list_del_init(&sup->work.work.entry); if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && - ssp->srcu_sup->srcu_size_state == SRCU_SIZE_SMALL) - ssp->srcu_sup->srcu_size_state = SRCU_SIZE_ALLOC; - queue_work(rcu_gp_wq, &ssp->work.work); + sup->srcu_size_state == SRCU_SIZE_SMALL) + sup->srcu_size_state = SRCU_SIZE_ALLOC; + queue_work(rcu_gp_wq, &sup->work.work); } } From a7bf4d7c16c1cf9753873879630a5d5169eb3206 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Mar 2023 09:05:50 -0700 Subject: [PATCH 33/54] srcu: Check for readers at module-exit time If a given statically allocated in-module srcu_struct structure was ever used for updates, srcu_module_going() will invoke cleanup_srcu_struct() at module-exit time. This will check for the error case of SRCU readers persisting past module-exit time. On the other hand, if this srcu_struct structure never went through a grace period, srcu_module_going() only invokes free_percpu(), which would result in strange failures if SRCU readers persisted past module-exit time. This commit therefore adds a srcu_readers_active() check to srcu_module_going(), splatting if readers have persisted and refraining from invoking free_percpu() in that case. Better to leak memory than to suffer silent memory corruption! [ paulmck: Apply Zhang, Qiang1 feedback on memory leak. ] Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 169a6513b739..f9dd6ed5503e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1911,7 +1911,8 @@ static void srcu_module_going(struct module *mod) if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) && !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static)) cleanup_srcu_struct(ssp); - free_percpu(ssp->sda); + if (!WARN_ON(srcu_readers_active(ssp))) + free_percpu(ssp->sda); } } From eabe7625f053050e4cecbbe98bd944f7e8eb14f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 18 Mar 2023 09:34:52 -0700 Subject: [PATCH 34/54] srcu: Fix long lines in srcu_get_delay() This commit creates an srcu_usage pointer named "sup" as a shorter synonym for the "ssp->srcu_sup" that was bloating several lines of code. Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Cc: Christoph Hellwig Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/srcutree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index f9dd6ed5503e..9699bcab7215 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -616,17 +616,18 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long gpstart; unsigned long j; unsigned long jbase = SRCU_INTERVAL; + struct srcu_usage *sup = ssp->srcu_sup; - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_sup->srcu_gp_seq), READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) jbase = 0; - if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq))) { + if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { j = jiffies - 1; - gpstart = READ_ONCE(ssp->srcu_sup->srcu_gp_start); + gpstart = READ_ONCE(sup->srcu_gp_start); if (time_after(j, gpstart)) jbase += j - gpstart; if (!jbase) { - WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay) + 1); - if (READ_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) + WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) jbase = 1; } } From 5ff8319f07db11c3b9347ce1dc0a6d951ae96d29 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 18 Mar 2023 10:51:50 -0700 Subject: [PATCH 35/54] srcu: Fix long lines in cleanup_srcu_struct() This commit creates an srcu_usage pointer named "sup" as a shorter synonym for the "ssp->srcu_sup" that was bloating several lines of code. Cc: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9699bcab7215..11a08201ca0a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -644,12 +644,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + struct srcu_usage *sup = ssp->srcu_sup; if (WARN_ON(!srcu_get_delay(ssp))) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ - flush_delayed_work(&ssp->srcu_sup->work); + flush_delayed_work(&sup->work); for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); @@ -658,21 +659,21 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ } - if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) != SRCU_STATE_IDLE) || - WARN_ON(rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq) != ssp->srcu_sup->srcu_gp_seq_needed) || + if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)), - rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ssp->srcu_sup->srcu_gp_seq_needed); + __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)), + rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? 
*/ } - kfree(ssp->srcu_sup->node); - ssp->srcu_sup->node = NULL; - ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; - if (!ssp->srcu_sup->sda_is_static) { + kfree(sup->node); + sup->node = NULL; + sup->srcu_size_state = SRCU_SIZE_SMALL; + if (!sup->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; - kfree(ssp->srcu_sup); + kfree(sup); ssp->srcu_sup = NULL; } } From 6c366522e10f2b5e916b3f08ac042df1a1cd512a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 18 Mar 2023 10:52:48 -0700 Subject: [PATCH 36/54] srcu: Fix long lines in srcu_gp_end() This commit creates an srcu_usage pointer named "sup" as a shorter synonym for the "ssp->srcu_sup" that was bloating several lines of code. Cc: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 11a08201ca0a..f661a0f6bc0d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -862,28 +862,29 @@ static void srcu_gp_end(struct srcu_struct *ssp) unsigned long sgsne; struct srcu_node *snp; int ss_state; + struct srcu_usage *sup = ssp->srcu_sup; /* Prevent more than one additional grace period. */ - mutex_lock(&ssp->srcu_sup->srcu_cb_mutex); + mutex_lock(&sup->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(ssp->srcu_sup); - idx = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq); + spin_lock_irq_rcu_node(sup); + idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_sup->srcu_gp_seq), READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) cbdelay = 0; - WRITE_ONCE(ssp->srcu_sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); - rcu_seq_end(&ssp->srcu_sup->srcu_gp_seq); - gpseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); - if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, gpseq); - spin_unlock_irq_rcu_node(ssp->srcu_sup); - mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); + WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); + rcu_seq_end(&sup->srcu_gp_seq); + gpseq = rcu_seq_current(&sup->srcu_gp_seq); + if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq)) + WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq); + spin_unlock_irq_rcu_node(sup); + mutex_unlock(&sup->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state); + ss_state = smp_load_acquire(&sup->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_BARRIER) { srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()), cbdelay); @@ -892,7 +893,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - last_lvl = snp >= ssp->srcu_sup->level[rcu_num_lvls - 1]; + last_lvl = snp >= sup->level[rcu_num_lvls - 1]; if (last_lvl) cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; @@ -924,18 +925,18 @@ static void srcu_gp_end(struct srcu_struct *ssp) } /* Callback initiation done, allow grace periods after next. 
*/ - mutex_unlock(&ssp->srcu_sup->srcu_cb_mutex); + mutex_unlock(&sup->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(ssp->srcu_sup); - gpseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); + spin_lock_irq_rcu_node(sup); + gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (!rcu_seq_state(gpseq) && - ULONG_CMP_LT(gpseq, ssp->srcu_sup->srcu_gp_seq_needed)) { + ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) { srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + spin_unlock_irq_rcu_node(sup); srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(ssp->srcu_sup); + spin_unlock_irq_rcu_node(sup); } /* Transition to big if needed. */ @@ -943,7 +944,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (ss_state == SRCU_SIZE_ALLOC) init_srcu_struct_nodes(ssp, GFP_KERNEL); else - smp_store_release(&ssp->srcu_sup->srcu_size_state, ss_state + 1); + smp_store_release(&sup->srcu_size_state, ss_state + 1); } } From cefc0a599b19d8dd0e26d0b2e43311bae7530ca1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 18 Mar 2023 12:31:53 -0700 Subject: [PATCH 37/54] srcu: Fix long lines in srcu_funnel_gp_start() This commit creates an srcu_usage pointer named "sup" as a shorter synonym for the "ssp->srcu_sup" that was bloating several lines of code. Cc: Christoph Hellwig Tested-by: Sachin Sant Tested-by: "Zhang, Qiang1" Tested-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index f661a0f6bc0d..a887cfc89894 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1004,9 +1004,10 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, struct srcu_node *snp; struct srcu_node *snp_leaf; unsigned long snp_seq; + struct srcu_usage *sup = ssp->srcu_sup; /* Ensure that snp node tree is fully initialized before traversing it */ - if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) snp_leaf = NULL; else snp_leaf = sdp->mynode; @@ -1014,7 +1015,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp_leaf) /* Each pass through the loop does one level of the srcu_node tree. */ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { - if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) && snp != snp_leaf) + if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; @@ -1041,20 +1042,20 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* Top of tree, must ensure the grace period will be started. */ spin_lock_irqsave_ssp_contention(ssp, &flags); - if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed, s)) { + if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load * acquire setting up for initialization. 
*/ - smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, s); /*^^^*/ + smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/ } - if (!do_norm && ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s); + if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s); /* If grace period not already in progress, start it. */ - if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) && - rcu_seq_state(ssp->srcu_sup->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)); + if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && + rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed)); srcu_gp_start(ssp); // And how can that list_add() in the "else" clause @@ -1063,12 +1064,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, // can only be executed during early boot when there is only // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, + queue_delayed_work(rcu_gp_wq, &sup->work, !!srcu_get_delay(ssp)); - else if (list_empty(&ssp->srcu_sup->work.work.entry)) - list_add(&ssp->srcu_sup->work.work.entry, &srcu_boot_list); + else if (list_empty(&sup->work.work.entry)) + list_add(&sup->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + spin_unlock_irqrestore_rcu_node(sup, flags); } /* From 3636d8d114c648af2484f8049eb7ecb204c542c6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 Jan 2023 14:29:34 +0100 Subject: [PATCH 38/54] rcu: Further comment and explain the state space of GP sequences The state space of the GP sequence number isn't documented and the definitions of its special values are scattered. This commit therefore gathers some common knowledge near the grace-period sequence-number definitions. Signed-off-by: Frederic Weisbecker Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- kernel/rcu/rcu.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 115616ac3bfa..a3adcf9a9919 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -14,6 +14,43 @@ /* * Grace-period counter management. + * + * The two least significant bits contain the control flags. + * The most significant bits contain the grace-period sequence counter. + * + * When both control flags are zero, no grace period is in progress. + * When either bit is non-zero, a grace period has started and is in + * progress. When the grace period completes, the control flags are reset + * to 0 and the grace-period sequence counter is incremented. + * + * However some specific RCU usages make use of custom values. + * + * SRCU special control values: + * + * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node + * is initialized. + * + * SRCU_STATE_IDLE : No SRCU gp is in progress + * + * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates + * we are scanning the readers on the slot + * defined as inactive (there might well + * be pending readers that will use that + * index, but their number is bounded). 
+ * + * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state() + * Indicates we are flipping the readers + * index and then scanning the readers on the + * slot newly designated as inactive (again, + * the number of pending readers that will use + * this inactive index is bounded). + * + * RCU polled GP special control value: + * + * RCU_GET_STATE_COMPLETED : State value indicating an already-completed + * polled GP has completed. This value covers + * both the state and the counter of the + * grace-period sequence number. */ #define RCU_SEQ_CTR_SHIFT 2 From e15a19306004b3d7b6a5fe269e4e7cb7934aa3fe Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Wed, 4 Jan 2023 12:29:01 -0800 Subject: [PATCH 39/54] srcu: Add comments for srcu_size_state The SRCU_SIZE_* names are not self-explanatory, so this commit therefore adds comments to the definitions. Signed-off-by: Pingfan Liu Cc: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Frederic Weisbecker Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: "Zhang, Qiang1" To: rcu@vger.kernel.org Reviewed-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- include/linux/srcutree.h | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 558057b517b7..a6910805f9c5 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -92,16 +92,29 @@ struct srcu_struct { struct lockdep_map dep_map; }; -/* Values for size state variable (->srcu_size_state). */ -#define SRCU_SIZE_SMALL 0 -#define SRCU_SIZE_ALLOC 1 -#define SRCU_SIZE_WAIT_BARRIER 2 -#define SRCU_SIZE_WAIT_CALL 3 -#define SRCU_SIZE_WAIT_CBS1 4 -#define SRCU_SIZE_WAIT_CBS2 5 -#define SRCU_SIZE_WAIT_CBS3 6 -#define SRCU_SIZE_WAIT_CBS4 7 -#define SRCU_SIZE_BIG 8 +// Values for size state variable (->srcu_size_state). Once the state +// has been set to SRCU_SIZE_ALLOC, the grace-period code advances through +// this state machine one step per grace period until the SRCU_SIZE_BIG state +// is reached. Otherwise, the state machine remains in the SRCU_SIZE_SMALL +// state indefinitely. +#define SRCU_SIZE_SMALL 0 // No srcu_node combining tree, ->node == NULL +#define SRCU_SIZE_ALLOC 1 // An srcu_node tree is being allocated, initialized, + // and then referenced by ->node. It will not be used. +#define SRCU_SIZE_WAIT_BARRIER 2 // The srcu_node tree starts being used by everything + // except call_srcu(), especially by srcu_barrier(). + // By the end of this state, all CPUs and threads + // are aware of this tree's existence. +#define SRCU_SIZE_WAIT_CALL 3 // The srcu_node tree starts being used by call_srcu(). + // By the end of this state, all of the call_srcu() + // invocations that were running on a non-boot CPU + // and using the boot CPU's callback queue will have + // completed. +#define SRCU_SIZE_WAIT_CBS1 4 // Don't trust the ->srcu_have_cbs[] grace-period +#define SRCU_SIZE_WAIT_CBS2 5 // sequence elements or the ->srcu_data_have_cbs[] +#define SRCU_SIZE_WAIT_CBS3 6 // CPU-bitmask elements until all four elements of +#define SRCU_SIZE_WAIT_CBS4 7 // each array have been initialized. +#define SRCU_SIZE_BIG 8 // The srcu_node combining tree is fully initialized + // and all aspects of it are being put to use. /* Values for state variable (bottom bits of ->srcu_gp_seq). 
*/ #define SRCU_STATE_IDLE 0 From e5ad8b68f8d49211a5a2a20fdccac4180451f7f6 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 13 Jan 2023 16:31:08 +0800 Subject: [PATCH 40/54] Documentation/RCU: s/not/note/ in checklist.rst "Please not that you *cannot* rely..." has a typo. Fix it. Acked-by: Joel Fernandes (Google) Signed-off-by: Qiuxu Zhuo Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- Documentation/RCU/checklist.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index cc361fb01ed4..bd3c58c44bef 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -70,7 +70,7 @@ over a rather long period of time, but improvements are always welcome! can serve as rcu_read_lock_sched(), but is less readable and prevents lockdep from detecting locking issues. - Please not that you *cannot* rely on code known to be built + Please note that you *cannot* rely on code known to be built only in non-preemptible kernels. Such code can and will break, especially in kernels built with CONFIG_PREEMPT_COUNT=y. From 754aa6427efeb8a059233e18e810263a108fdd71 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 28 Jan 2023 03:59:01 +0000 Subject: [PATCH 41/54] srcu: Clarify comments on memory barrier "E" There is an smp_mb() named "E" in srcu_flip() immediately before the increment (flip) of the srcu_struct structure's ->srcu_idx. The purpose of E is to order the preceding scan's read of lock counters against the flipping of the ->srcu_idx, in order to prevent new readers from continuing to use the old ->srcu_idx value, which might needlessly extend the grace period. However, this ordering is already enforced because of the control dependency between the preceding scan and the ->srcu_idx flip. This control dependency exists because atomic_long_read() is used to scan the counts, because WRITE_ONCE() is used to flip ->srcu_idx, and because ->srcu_idx is not flipped until the ->srcu_lock_count[] and ->srcu_unlock_count[] counts match. And such a match cannot happen when there is an in-flight reader that started before the flip (observation courtesy Mathieu Desnoyers). The litmus test below (courtesy of Frederic Weisbecker, with changes for ctrldep by Boqun and Joel) shows this: C srcu (* * bad condition: P0's first scan (SCAN1) saw P1's idx=0 LOCK count inc, though P1 saw flip. * * So basically, the ->po ordering on both P0 and P1 is enforced via ->ppo * (control deps) on both sides, and both P0 and P1 are interconnected by ->rf * relations. Combining the ->ppo with ->rf, a cycle is impossible. *) {} // updater P0(int *IDX, int *LOCK0, int *UNLOCK0, int *LOCK1, int *UNLOCK1) { int lock1; int unlock1; int lock0; int unlock0; // SCAN1 unlock1 = READ_ONCE(*UNLOCK1); smp_mb(); // A lock1 = READ_ONCE(*LOCK1); // FLIP if (lock1 == unlock1) { // Control dep smp_mb(); // E // Remove E and still passes. 
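// (Explanatory annotation, not part of the original litmus test: the
// "lock1 == unlock1" check above supplies the control dependency that
// orders the WRITE_ONCE() flip below after SCAN1's reads of the lock
// and unlock counters, which is why the test passes even with E removed.)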
WRITE_ONCE(*IDX, 1); smp_mb(); // D // SCAN2 unlock0 = READ_ONCE(*UNLOCK0); smp_mb(); // A lock0 = READ_ONCE(*LOCK0); } } // reader P1(int *IDX, int *LOCK0, int *UNLOCK0, int *LOCK1, int *UNLOCK1) { int tmp; int idx1; int idx2; // 1st reader idx1 = READ_ONCE(*IDX); if (idx1 == 0) { // Control dep tmp = READ_ONCE(*LOCK0); WRITE_ONCE(*LOCK0, tmp + 1); smp_mb(); /* B and C */ tmp = READ_ONCE(*UNLOCK0); WRITE_ONCE(*UNLOCK0, tmp + 1); } else { tmp = READ_ONCE(*LOCK1); WRITE_ONCE(*LOCK1, tmp + 1); smp_mb(); /* B and C */ tmp = READ_ONCE(*UNLOCK1); WRITE_ONCE(*UNLOCK1, tmp + 1); } } exists (0:lock1=1 /\ 1:idx1=1) More complicated litmus tests with multiple SRCU readers also show that memory barrier E is not needed. This commit therefore clarifies the comment on memory barrier E. Why not also remove that redundant smp_mb()? Because control dependencies are quite fragile due to their not being recognized by most compilers and tools. Control dependencies therefore exact an ongoing maintenance burden, and such a burden cannot be justified in this slowpath. Therefore, that smp_mb() stays until such time as its overhead becomes a measurable problem in a real workload running on a real production system, or until such time as compilers start paying attention to this sort of control dependency. Co-developed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Co-developed-by: Mathieu Desnoyers Signed-off-by: Mathieu Desnoyers Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Reviewed-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- kernel/rcu/srcutree.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ab4ee58af84b..68f89c533057 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1085,16 +1085,36 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) static void srcu_flip(struct srcu_struct *ssp) { /* - * Ensure that if this updater saw a given reader's increment - * from __srcu_read_lock(), that reader was using an old value - * of ->srcu_idx. Also ensure that if a given reader sees the - * new value of ->srcu_idx, this updater's earlier scans cannot - * have seen that reader's increments (which is OK, because this - * grace period need not wait on that reader). + * Because the flip of ->srcu_idx is executed only if the + * preceding call to srcu_readers_active_idx_check() found that + * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched + * and because that summing uses atomic_long_read(), there is + * ordering due to a control dependency between that summing and + * the WRITE_ONCE() in this call to srcu_flip(). This ordering + * ensures that if this updater saw a given reader's increment from + * __srcu_read_lock(), that reader was using a value of ->srcu_idx + * from before the previous call to srcu_flip(), which should be + * quite rare. This ordering thus helps forward progress because + * the grace period could otherwise be delayed by additional + * calls to __srcu_read_lock() using that old (soon to be new) + * value of ->srcu_idx. + * + * This sum-equality check and ordering also ensures that if + * a given call to __srcu_read_lock() uses the new value of + * ->srcu_idx, this updater's earlier scans cannot have seen + * that reader's increments, which is all to the good, because + * this grace period need not wait on that reader. 
After all, + * if those earlier scans had seen that reader, there would have + * been a sum mismatch and this code would not be reached. + * + * This means that the following smp_mb() is redundant, but + * it stays until either (1) Compilers learn about this sort of + * control dependency or (2) Some production workload running on + * a production system is unduly delayed by this slowpath smp_mb(). */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. /* * Ensure that if the updater misses an __srcu_read_unlock() From c4af9e00894575797395e6cdd98b3a227ee70e29 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 29 Jan 2023 15:10:49 -0800 Subject: [PATCH 42/54] Documentation: RCU: Correct spelling Correct spelling problems for Documentation/RCU/ as reported by codespell. Note: in RTFP.txt, there are other misspellings that are left as is since they were used that way in email Subject: lines or in LWN.net articles. [preemptable, Preemptable, synchonisation] Acked-by: Joel Fernandes (Google) Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "Paul E. McKenney" Cc: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: rcu@vger.kernel.org Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- .../Expedited-Grace-Periods.rst | 6 +++--- .../Memory-Ordering/Tree-RCU-Memory-Ordering.rst | 2 +- Documentation/RCU/RTFP.txt | 10 +++++----- Documentation/RCU/UP.rst | 4 ++-- Documentation/RCU/lockdep.rst | 2 +- Documentation/RCU/torture.rst | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst index c9c957c85bac..93d899d53258 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst @@ -277,7 +277,7 @@ the following access functions: Again, only one request in a given batch need actually carry out a grace-period operation, which means there must be an efficient way to -identify which of many concurrent reqeusts will initiate the grace +identify which of many concurrent requests will initiate the grace period, and that there be an efficient way for the remaining requests to wait for that grace period to complete. However, that is the topic of the next section. @@ -405,7 +405,7 @@ Use of Workqueues In earlier implementations, the task requesting the expedited grace period also drove it to completion. This straightforward approach had the disadvantage of needing to account for POSIX signals sent to user -tasks, so more recent implemementations use the Linux kernel's +tasks, so more recent implementations use the Linux kernel's workqueues (see Documentation/core-api/workqueue.rst). The requesting task still does counter snapshotting and funnel-lock @@ -465,7 +465,7 @@ corresponding disadvantage that workqueues cannot be used until they are initialized, which does not happen until some time after the scheduler spawns the first task. Given that there are parts of the kernel that really do want to execute grace periods during this mid-boot “dead -zone”, expedited grace periods must do something else during thie time. +zone”, expedited grace periods must do something else during this time. 
+zone”, expedited grace periods must do something else during this time.
What they do is to fall back to the old practice of requiring that the requesting task drive the expedited grace period, as was the case before diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 7fdf151a8680..5750f125361b 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -168,7 +168,7 @@ an ``atomic_add_return()`` of zero) to detect idle CPUs. +-----------------------------------------------------------------------+ The approach must be extended to handle one final case, that of waking a -task blocked in ``synchronize_rcu()``. This task might be affinitied to +task blocked in ``synchronize_rcu()``. This task might be affined to a CPU that is not yet aware that the grace period has ended, and thus might not yet be subject to the grace period's memory ordering. Therefore, there is an ``smp_mb()`` after the return from diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index 588d97366a46..db8f16b392aa 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt @@ -201,7 +201,7 @@ work looked at debugging uses of RCU [Seyster:2011:RFA:2075416.2075425]. In 2012, Josh Triplett received his Ph.D. with his dissertation covering RCU-protected resizable hash tables and the relationship between memory barriers and read-side traversal order: If the updater -is making changes in the opposite direction from the read-side traveral +is making changes in the opposite direction from the read-side traversal order, the updater need only execute a memory-barrier instruction, but if in the same direction, the updater needs to wait for a grace period between the individual updates [JoshTriplettPhD]. Also in 2012, @@ -1245,7 +1245,7 @@ Oregon Health and Sciences University" [Viewed September 5, 2005]" ,annotation={ First posting showing how RCU can be safely adapted for - preemptable RCU read side critical sections. + preemptible RCU read side critical sections. } } @@ -1888,7 +1888,7 @@ Revised: \url{https://lore.kernel.org/r/20070910183004.GA3299@linux.vnet.ibm.com} [Viewed October 25, 2007]" ,annotation={ - Final patch for preemptable RCU to -rt. (Later patches were + Final patch for preemptible RCU to -rt. (Later patches were to mainline, eventually incorporated.) } } @@ -2275,7 +2275,7 @@ lot of {Linux} into your technology!!!" \url{https://lore.kernel.org/r/20090724001429.GA17374@linux.vnet.ibm.com} [Viewed August 15, 2009]" ,annotation={ - First posting of simple and fast preemptable RCU. + First posting of simple and fast preemptible RCU. } } @@ -2639,7 +2639,7 @@ lot of {Linux} into your technology!!!" RCU-protected hash tables, barriers vs. read-side traversal order. . If the updater is making changes in the opposite direction from - the read-side traveral order, the updater need only execute a + the read-side traversal order, the updater need only execute a memory-barrier instruction, but if in the same direction, the updater needs to wait for a grace period between the individual updates. diff --git a/Documentation/RCU/UP.rst b/Documentation/RCU/UP.rst index 8b20fd45f255..4060d7a2f62a 100644 --- a/Documentation/RCU/UP.rst +++ b/Documentation/RCU/UP.rst @@ -107,7 +107,7 @@ UP systems, including PREEMPT SMP builds running on UP systems. Quick Quiz #3: Why can't synchronize_rcu() return immediately on UP systems running - preemptable RCU? 
+ preemptible RCU? .. _answer_quick_quiz_up: @@ -143,7 +143,7 @@ Answer to Quick Quiz #2: Answer to Quick Quiz #3: Why can't synchronize_rcu() return immediately on UP systems - running preemptable RCU? + running preemptible RCU? Because some other task might have been preempted in the middle of an RCU read-side critical section. If synchronize_rcu() diff --git a/Documentation/RCU/lockdep.rst b/Documentation/RCU/lockdep.rst index 2749f43ec1b0..69e73a39bd11 100644 --- a/Documentation/RCU/lockdep.rst +++ b/Documentation/RCU/lockdep.rst @@ -65,7 +65,7 @@ checking of rcu_dereference() primitives: rcu_access_pointer(p): Return the value of the pointer and omit all barriers, but retain the compiler constraints that prevent duplicating - or coalescsing. This is useful when testing the + or coalescing. This is useful when testing the value of the pointer itself, for example, against NULL. The rcu_dereference_check() check expression can be any boolean diff --git a/Documentation/RCU/torture.rst b/Documentation/RCU/torture.rst index 0316ba0c6922..b3b6dfa85248 100644 --- a/Documentation/RCU/torture.rst +++ b/Documentation/RCU/torture.rst @@ -216,7 +216,7 @@ Kernel boot arguments can also be supplied, for example, to control rcutorture's module parameters. For example, to test a change to RCU's CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'". This will of course result in the scripting reporting a failure, namely -the resuling RCU CPU stall warning. As noted above, reducing memory may +the resulting RCU CPU stall warning. As noted above, reducing memory may require disabling rcutorture's callback-flooding tests:: kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \ @@ -370,5 +370,5 @@ You can also re-run a previous remote run in a manner similar to kvm.sh: tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28-remote \ --duration 24h -In this case, most of the kvm-again.sh parmeters may be supplied following +In this case, most of the kvm-again.sh parameters may be supplied following the pathname of the old run-results directory. From 81573694a48599f1edf928f390c6dcab5cbb6f49 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 1 Feb 2023 16:09:53 +0100 Subject: [PATCH 43/54] doc: Update whatisRCU.rst The kfree_rcu() macro is deprecated. Rename it to its new kfree_rcu_mightsleep() name in this documentation. Acked-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- Documentation/RCU/whatisRCU.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 2c5563a91998..8eddef28d3a1 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -597,10 +597,10 @@ to avoid having to write your own callback:: If the occasional sleep is permitted, the single-argument form may be used, omitting the rcu_head structure from struct foo. - kfree_rcu(old_fp); + kfree_rcu_mightsleep(old_fp); -This variant of kfree_rcu() almost never blocks, but might do so by -invoking synchronize_rcu() in response to memory-allocation failure. +This variant almost never blocks, but might do so by invoking +synchronize_rcu() in response to memory-allocation failure. Again, see checklist.rst for additional rules governing the use of RCU. From 09853fb89f6bc46727ccd325c0f6f266df51d155 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sat, 4 Mar 2023 13:40:55 -0800 Subject: [PATCH 44/54] rcu: Add comment to rcu_do_batch() identifying rcuoc code path This commit adds a comment to help explain why the "else" clause of the in_serving_softirq() "if" statement does not need to enforce a time limit. The reason is that this "else" clause handles rcuoc kthreads that do not block handlers for other softirq vectors. Acked-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e880c09ab59..06cc6a6ad819 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2131,6 +2131,8 @@ static void rcu_do_batch(struct rcu_data *rdp) break; } } else { + // In rcuoc context, so no worries about depriving + // other softirq vectors of CPU cycles. local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); From a77b2109f71e0a963325869123b9b5f159b2ae5e Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 1 Feb 2023 16:08:08 +0100 Subject: [PATCH 45/54] drbd: Rename kvfree_rcu() to kvfree_rcu_mightsleep() The kvfree_rcu() macro's single-argument form is deprecated. Therefore switch to the new kvfree_rcu_mightsleep() variant. The goal is to avoid accidental use of the single-argument forms, which can introduce functionality bugs in atomic contexts and latency bugs in non-atomic contexts. Cc: Jens Axboe Cc: Philipp Reisner Cc: Lars Ellenberg Begrudgingly-acked-by: Jens Axboe Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- drivers/block/drbd/drbd_nl.c | 6 +++--- drivers/block/drbd/drbd_receiver.c | 4 ++-- drivers/block/drbd/drbd_state.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 60757ac31701..f49f2a5282e1 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1615,7 +1615,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) drbd_send_sync_param(peer_device); } - kvfree_rcu(old_disk_conf); + kvfree_rcu_mightsleep(old_disk_conf); kfree(old_plan); mod_timer(&device->request_timer, jiffies + HZ); goto success; @@ -2446,7 +2446,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&connection->resource->conf_update); mutex_unlock(&connection->data.mutex); - kvfree_rcu(old_net_conf); + kvfree_rcu_mightsleep(old_net_conf); if (connection->cstate >= C_WF_REPORT_PARAMS) { struct drbd_peer_device *peer_device; @@ -2860,7 +2860,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) new_disk_conf->disk_size = (sector_t)rs.resize_size; rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&device->resource->conf_update); - kvfree_rcu(old_disk_conf); + kvfree_rcu_mightsleep(old_disk_conf); new_disk_conf = NULL; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 757f4692b5bd..e197b2a465d2 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3759,7 +3759,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in drbd_info(connection, "peer data-integrity-alg: %s\n", integrity_alg[0] ? 
integrity_alg : "(none)"); - kvfree_rcu(old_net_conf); + kvfree_rcu_mightsleep(old_net_conf); return 0; disconnect_rcu_unlock: @@ -4127,7 +4127,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&connection->resource->conf_update); - kvfree_rcu(old_disk_conf); + kvfree_rcu_mightsleep(old_disk_conf); drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n", (unsigned long)p_usize, (unsigned long)my_usize); diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 75d13ea0024f..2aeea295fa28 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -2071,7 +2071,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) conn_free_crypto(connection); mutex_unlock(&connection->resource->conf_update); - kvfree_rcu(old_conf); + kvfree_rcu_mightsleep(old_conf); } if (ns_max.susp_fen) { From 09b2286af617bc6f9480404b565393628f86037d Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 1 Feb 2023 16:08:09 +0100 Subject: [PATCH 46/54] misc: vmw_vmci: Rename kvfree_rcu() to kvfree_rcu_mightsleep() The kvfree_rcu() macro's single-argument form is deprecated. Therefore switch to the new kvfree_rcu_mightsleep() variant. The goal is to avoid accidental use of the single-argument forms, which can introduce functionality bugs in atomic contexts and latency bugs in non-atomic contexts. Cc: Bryan Tan Cc: Vishnu Dasa Reviewed-by: Vishnu Dasa Acked-by: Greg Kroah-Hartman Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- drivers/misc/vmw_vmci/vmci_context.c | 2 +- drivers/misc/vmw_vmci/vmci_event.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/vmw_vmci/vmci_context.c b/drivers/misc/vmw_vmci/vmci_context.c index 172696abce31..f22b44827e92 100644 --- a/drivers/misc/vmw_vmci/vmci_context.c +++ b/drivers/misc/vmw_vmci/vmci_context.c @@ -687,7 +687,7 @@ int vmci_ctx_remove_notification(u32 context_id, u32 remote_cid) spin_unlock(&context->lock); if (notifier) - kvfree_rcu(notifier); + kvfree_rcu_mightsleep(notifier); vmci_ctx_put(context); diff --git a/drivers/misc/vmw_vmci/vmci_event.c b/drivers/misc/vmw_vmci/vmci_event.c index 2100297c94ad..5d7ac07623c2 100644 --- a/drivers/misc/vmw_vmci/vmci_event.c +++ b/drivers/misc/vmw_vmci/vmci_event.c @@ -209,7 +209,7 @@ int vmci_event_unsubscribe(u32 sub_id) if (!s) return VMCI_ERROR_NOT_FOUND; - kvfree_rcu(s); + kvfree_rcu_mightsleep(s); return VMCI_SUCCESS; } From cae16f2c2e11c60c888715f4d98c12740683d6a2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 1 Feb 2023 16:08:10 +0100 Subject: [PATCH 47/54] tracing: Rename kvfree_rcu() to kvfree_rcu_mightsleep() The kvfree_rcu() macro's single-argument form is deprecated. Therefore switch to the new kvfree_rcu_mightsleep() variant. The goal is to avoid accidental use of the single-argument forms, which can introduce functionality bugs in atomic contexts and latency bugs in non-atomic contexts. Cc: Steven Rostedt (VMware) Acked-by: Daniel Bristot de Oliveira Acked-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney Signed-off-by: Joel Fernandes (Google) --- kernel/trace/trace_osnoise.c | 2 +- kernel/trace/trace_probe.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 04f0fdae19a1..f68ca1e6460f 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -159,7 +159,7 @@ static void osnoise_unregister_instance(struct trace_array *tr) if (!found) return; - kvfree_rcu(inst); + kvfree_rcu_mightsleep(inst); } /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 20d0c4a97633..2d2616678295 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1172,7 +1172,7 @@ int trace_probe_remove_file(struct trace_probe *tp, return -ENOENT; list_del_rcu(&link->list); - kvfree_rcu(link); + kvfree_rcu_mightsleep(link); if (list_empty(&tp->event->files)) trace_probe_clear_flag(tp, TP_FLAG_TRACE); From c779b97281d52faac253e9afb004537e50ada4e8 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 1 Feb 2023 16:08:11 +0100 Subject: [PATCH 48/54] lib/test_vmalloc.c: Rename kvfree_rcu() to kvfree_rcu_mightsleep() The kvfree_rcu() macro's single-argument form is deprecated. Therefore switch to the new kvfree_rcu_mightsleep() variant. The goal is to avoid accidental use of the single-argument forms, which can introduce functionality bugs in atomic contexts and latency bugs in non-atomic contexts. Acked-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- lib/test_vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index de4ee0d50906..cd2bdba6d3ed 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -334,7 +334,7 @@ kvfree_rcu_1_arg_vmalloc_test(void) return -1; p->array[0] = 'a'; - kvfree_rcu(p); + kvfree_rcu_mightsleep(p); } return 0; From aef3b8b8dd55a8bdfd7ef96cfde0aa71fd82ed10 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 1 Feb 2023 16:08:12 +0100 Subject: [PATCH 49/54] net/sysctl: Rename kvfree_rcu() to kvfree_rcu_mightsleep() The kfree_rcu() and kvfree_rcu() macros' single-argument forms are deprecated. Therefore switch to the new kfree_rcu_mightsleep() and kvfree_rcu_mightsleep() variants. The goal is to avoid accidental use of the single-argument forms, which can introduce functionality bugs in atomic contexts and latency bugs in non-atomic contexts. Acked-by: Jakub Kicinski Cc: Eric Dumazet Cc: David S. Miller Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 

From 23532061ad3047c014eacab048e9cef2d6463742 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Wed, 1 Feb 2023 16:08:14 +0100
Subject: [PATCH 50/54] net/mlx5: Rename kfree_rcu() to kfree_rcu_mightsleep()

The kfree_rcu() and kvfree_rcu() macros' single-argument forms are
deprecated. Therefore switch to the new kfree_rcu_mightsleep() and
kvfree_rcu_mightsleep() variants. The goal is to avoid accidental use
of the single-argument forms, which can introduce functionality bugs in
atomic contexts and latency bugs in non-atomic contexts.

Cc: Ariel Levkovich
Cc: Saeed Mahameed
Cc: Vlad Buslov
Signed-off-by: Uladzislau Rezki (Sony)
Signed-off-by: Paul E. McKenney
Signed-off-by: Joel Fernandes (Google)
---
 drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c  | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c
index ca834bbcb44f..8afcec0c5d3c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c
@@ -242,7 +242,7 @@ mlx5e_int_port_remove(struct mlx5e_tc_int_port_priv *priv,
 	mlx5_del_flow_rules(int_port->rx_rule);
 	mapping_remove(ctx, int_port->mapping);
 	mlx5e_int_port_metadata_free(priv, int_port->match_metadata);
-	kfree_rcu(int_port);
+	kfree_rcu_mightsleep(int_port);
 	priv->num_ports--;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c
index 08d0929e8260..b811dad7370a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c
@@ -670,7 +670,7 @@ static int mlx5e_macsec_del_txsa(struct macsec_context *ctx)
 	mlx5e_macsec_cleanup_sa(macsec, tx_sa, true);
 	mlx5_destroy_encryption_key(macsec->mdev, tx_sa->enc_key_id);
-	kfree_rcu(tx_sa);
+	kfree_rcu_mightsleep(tx_sa);
 	macsec_device->tx_sa[assoc_num] = NULL;
 
 out:
@@ -849,7 +849,7 @@ static void macsec_del_rxsc_ctx(struct mlx5e_macsec *macsec, struct mlx5e_macsec
 	xa_erase(&macsec->sc_xarray, rx_sc->sc_xarray_element->fs_id);
 	metadata_dst_free(rx_sc->md_dst);
 	kfree(rx_sc->sc_xarray_element);
-	kfree_rcu(rx_sc);
+	kfree_rcu_mightsleep(rx_sc);
 }
 
 static int mlx5e_macsec_del_rxsc(struct macsec_context *ctx)

From 10e4f310a887fd908b6800088f64fe21d7cc75a6 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Wed, 1 Feb 2023 16:08:15 +0100
Subject: [PATCH 51/54] ext4/super: Rename kfree_rcu() to kfree_rcu_mightsleep()

The kfree_rcu() and kvfree_rcu() macros' single-argument forms are
deprecated. Therefore switch to the new kfree_rcu_mightsleep() and
kvfree_rcu_mightsleep() variants. The goal is to avoid accidental use
of the single-argument forms, which can introduce functionality bugs in
atomic contexts and latency bugs in non-atomic contexts.

Cc: Theodore Ts'o
Cc: Lukas Czerner
Acked-by: Theodore Ts'o
Signed-off-by: Uladzislau Rezki (Sony)
Signed-off-by: Paul E. McKenney
Signed-off-by: Joel Fernandes (Google)
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88f7b8a88c76..405a66b47311 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2500,7 +2500,7 @@ static void ext4_apply_quota_options(struct fs_context *fc,
 			qname = rcu_replace_pointer(sbi->s_qf_names[i], qname,
 						    lockdep_is_held(&sb->s_umount));
 			if (qname)
-				kfree_rcu(qname);
+				kfree_rcu_mightsleep(qname);
 		}
 	}
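
The ext4 hunk also shows why the single-argument form cannot simply be
deleted: qname is a bare character buffer with no embedded rcu_head, so
the two-argument form is not available for it. A hedged sketch of the
two situations, with hypothetical types:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct conf {
		unsigned int flags;
		struct rcu_head rh;	/* makes the two-argument form possible */
	};

	static void free_both(struct conf *c, char *name)
	{
		/* Embedded rcu_head: atomic-safe, nothing allocated. */
		kfree_rcu(c, rh);

		/* A bare buffer carries no rcu_head, so only the sleepable
		 * single-argument variant (or an open-coded call_rcu()
		 * wrapper) can free it after a grace period.
		 */
		kfree_rcu_mightsleep(name);
	}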

From 936c7e19c695d624adac28ea390fd8d53c653542 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Wed, 1 Feb 2023 16:09:52 +0100
Subject: [PATCH 52/54] rcuscale: Rename kfree_rcu() to kfree_rcu_mightsleep()

The kfree_rcu() and kvfree_rcu() macros' single-argument forms are
deprecated. Therefore switch to the new kfree_rcu_mightsleep() and
kvfree_rcu_mightsleep() variants. The goal is to avoid accidental use
of the single-argument forms, which can introduce functionality bugs in
atomic contexts and latency bugs in non-atomic contexts.

Acked-by: Joel Fernandes (Google)
Signed-off-by: Uladzislau Rezki (Sony)
Signed-off-by: Paul E. McKenney
Signed-off-by: Joel Fernandes (Google)
---
 kernel/rcu/rcuscale.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 91fb5905a008..98b75995b680 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -716,7 +716,7 @@ kfree_scale_thread(void *arg)
 			// is tested.
 			if ((kfree_rcu_test_single && !kfree_rcu_test_double) ||
 			    (kfree_rcu_test_both && torture_random(&tr) & 0x800))
-				kfree_rcu(alloc_ptr);
+				kfree_rcu_mightsleep(alloc_ptr);
 			else
 				kfree_rcu(alloc_ptr, rh);
 		}

From eb56a4cbc3ea348dc07fbe82531c27cce83df3b9 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)"
Date: Fri, 10 Mar 2023 01:18:12 +0000
Subject: [PATCH 53/54] mac802154: Rename kfree_rcu() to kvfree_rcu_mightsleep()

The k[v]free_rcu() macro's single-argument form is deprecated. Therefore
switch to the new k[v]free_rcu_mightsleep() variant. The goal is to
avoid accidental use of the single-argument forms, which can introduce
functionality bugs in atomic contexts and latency bugs in non-atomic
contexts.

The callers are holding a mutex so the context allows blocking. Hence
using the API with a single argument will be fine, but use its new name.

There is no functionality change with this patch.

Fixes: 57588c71177f ("mac802154: Handle passive scanning")
Acked-by: Stefan Schmidt
Reviewed-by: Paul E. McKenney
Signed-off-by: Joel Fernandes (Google)
---
 net/mac802154/scan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/mac802154/scan.c b/net/mac802154/scan.c
index 9b0933a185eb..5c191bedd72c 100644
--- a/net/mac802154/scan.c
+++ b/net/mac802154/scan.c
@@ -52,7 +52,7 @@ static int mac802154_scan_cleanup_locked(struct ieee802154_local *local,
 	request = rcu_replace_pointer(local->scan_req, NULL, 1);
 	if (!request)
 		return 0;
-	kfree_rcu(request);
+	kvfree_rcu_mightsleep(request);
 
 	/* Advertize first, while we know the devices cannot be removed */
 	if (aborted)
@@ -403,7 +403,7 @@ int mac802154_stop_beacons_locked(struct ieee802154_local *local,
 	request = rcu_replace_pointer(local->beacon_req, NULL, 1);
 	if (!request)
 		return 0;
-	kfree_rcu(request);
+	kvfree_rcu_mightsleep(request);
 
 	nl802154_beaconing_done(wpan_dev);
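
Note that this patch moves from the kfree_rcu() family to the
kvfree_rcu() family as well as renaming. That is safe because the
kvfree() variants accept memory from either kmalloc() or vmalloc(),
which is consistent with the log's statement that there is no
functionality change. A hedged sketch of that property, with
hypothetical names (struct req, req_slot):

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct req {
		size_t len;
		unsigned long data[64];
	};

	static struct req __rcu *req_slot;

	static int publish_req(void)
	{
		/* kvmalloc() may hand back slab memory or vmalloc memory. */
		struct req *r = kvmalloc(sizeof(*r), GFP_KERNEL);

		if (!r)
			return -ENOMEM;
		rcu_assign_pointer(req_slot, r);
		return 0;
	}

	static void retire_req(void)
	{
		/* Passing 1 as the lockdep condition mirrors the scan.c
		 * hunks, which assert the caller already holds the lock.
		 */
		struct req *r = rcu_replace_pointer(req_slot, NULL, 1);

		/* kvfree_rcu_mightsleep() frees either allocation kind, so
		 * it is also correct for plain kmalloc()'d objects.
		 */
		if (r)
			kvfree_rcu_mightsleep(r);
	}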

From 1eacac3255495be7502d406e2ba5444fb5c3607c Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)"
Date: Mon, 6 Mar 2023 14:33:58 +0000
Subject: [PATCH 54/54] checkpatch: Error out if deprecated RCU API used

Single-argument kvfree_rcu() usage is being deprecated [1] [2]. However,
till all users are converted, we would like to introduce checkpatch
errors for new patches submitted. This patch adds support for the same.
Tested with a trial patch.

For now, we are only considering usages that don't have compound
nesting, for example ignore: kvfree_rcu( (rcu_head_obj), rcu_head_name).
This is sufficient as such usages are unlikely.

Once all users are converted and we remove the old API, we can also
revert this checkpatch patch then.

[1] https://lore.kernel.org/rcu/CAEXW_YRhHaVuq+5f+VgCZM=SF+9xO+QXaxe0yE7oA9iCXK-XPg@mail.gmail.com/
[2] https://lore.kernel.org/rcu/CAEXW_YSY=q2_uaE2qo4XSGjzs4+C102YMVJ7kWwuT5LGmJGGew@mail.gmail.com/

Acked-by: Paul E. McKenney
Signed-off-by: Joel Fernandes (Google)
---
 scripts/checkpatch.pl | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index bd44d12965c9..4bfbe3c9fa15 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6388,6 +6388,15 @@ sub process {
 			}
 		}
 
+# check for soon-to-be-deprecated single-argument k[v]free_rcu() API
+		if ($line =~ /\bk[v]?free_rcu\s*\([^(]+\)/) {
+			if ($line =~ /\bk[v]?free_rcu\s*\([^,]+\)/) {
+				ERROR("DEPRECATED_API",
+				      "Single-argument k[v]free_rcu() API is deprecated, please pass rcu_head object or call k[v]free_rcu_mightsleep()." . $herecurr);
+			}
+		}
+
+
 # check for unnecessary "Out of Memory" messages
 		if ($line =~ /^\+.*\b$logFunctions\s*\(/ &&
 		    $prevline =~ /^[ \+]\s*if\s*\(\s*(\!\s*|NULL\s*==\s*)?($Lval)(\s*==\s*NULL\s*)?\s*\)/ &&