From 18389c4570211e10e94f4a2ce907d01397abc335 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 19 Mar 2021 16:30:15 -0700 Subject: [PATCH 01/76] doc: Fix statement of RCU's memory-ordering requirements The sentence defining the relationship of accesses before a grace period to read-side accesses following that same grace period was missing a small word: "not". This commit therefore adds it. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- .../RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index a648b423ba0e..3f6ce41ee0c5 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -21,7 +21,7 @@ Any code that happens after the end of a given RCU grace period is guaranteed to see the effects of all accesses prior to the beginning of that grace period that are within RCU read-side critical sections. Similarly, any code that happens before the beginning of a given RCU grace -period is guaranteed to see the effects of all accesses following the end +period is guaranteed to not see the effects of all accesses following the end of that grace period that are within RCU read-side critical sections. Note well that RCU-sched read-side critical sections include any region From 58d0db869d7ab8ca97b521f167022caa2c42cbe7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 4 Apr 2021 23:58:43 +0200 Subject: [PATCH 02/76] doc: Fix diagram references in memory-ordering document The three diagrams describing rcu_gp_init() all spuriously refer to the same figure, probably due to a copy/paste issue. This commit fixes these references. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- .../RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 3f6ce41ee0c5..11cdab037bff 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -339,14 +339,14 @@ The diagram below shows the path of ordering if the leftmost leftmost ``rcu_node`` structure offlines its last CPU and if the next ``rcu_node`` structure has no online CPUs). -.. kernel-figure:: TreeRCU-gp-init-1.svg +.. kernel-figure:: TreeRCU-gp-init-2.svg The final ``rcu_gp_init()`` pass through the ``rcu_node`` tree traverses breadth-first, setting each ``rcu_node`` structure's ``->gp_seq`` field to the newly advanced value from the ``rcu_state`` structure, as shown in the following diagram. -.. kernel-figure:: TreeRCU-gp-init-1.svg +.. kernel-figure:: TreeRCU-gp-init-3.svg This change will also cause each CPU's next call to ``__note_gp_changes()`` to notice that a new grace period has started, From e5bd61e82b7a60c92bc09a618a0d8a612689037b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Apr 2021 11:55:43 -0700 Subject: [PATCH 03/76] tools/rcu: Add drgn script to dump number of RCU callbacks This commit adds an rcu-cbs.py drgn script that computes the number of RCU callbacks waiting to be invoked. This information can be helpful when managing systems that are short of memory and that have software components that make heavy use of RCU, for example, by opening and closing files in tight loops. (But please note that there are almost always better ways to get your job done than by opening and closing files in tight loops.) Reported-by: Richard Weinberger Signed-off-by: Paul E. McKenney --- tools/rcu/rcu-cbs.py | 46 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tools/rcu/rcu-cbs.py diff --git a/tools/rcu/rcu-cbs.py b/tools/rcu/rcu-cbs.py new file mode 100644 index 000000000000..f8b461b9eaa7 --- /dev/null +++ b/tools/rcu/rcu-cbs.py @@ -0,0 +1,46 @@ +#!/usr/bin/env drgn +# SPDX-License-Identifier: GPL-2.0+ +# +# Dump out the number of RCU callbacks outstanding. +# +# On older kernels having multiple flavors of RCU, this dumps out the +# number of callbacks for the most heavily used flavor. +# +# Usage: sudo drgn rcu-cbs.py +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +import sys +import drgn +from drgn import NULL, Object +from drgn.helpers.linux import * + +def get_rdp0(prog): + try: + rdp0 = prog.variable('rcu_preempt_data', 'kernel/rcu/tree.c'); + except LookupError: + rdp0 = NULL; + + if rdp0 == NULL: + try: + rdp0 = prog.variable('rcu_sched_data', + 'kernel/rcu/tree.c'); + except LookupError: + rdp0 = NULL; + + if rdp0 == NULL: + rdp0 = prog.variable('rcu_data', 'kernel/rcu/tree.c'); + return rdp0.address_of_(); + +rdp0 = get_rdp0(prog); + +# Sum up RCU callbacks. +sum = 0; +for cpu in for_each_possible_cpu(prog): + rdp = per_cpu_ptr(rdp0, cpu); + len = rdp.cblist.len.value_(); + # print("CPU " + str(cpu) + " RCU callbacks: " + str(len)); + sum += len; +print("Number of RCU callbacks in flight: " + str(sum)); From 4c9c3809ae2ecfcece9acb3f51427e617d21fafb Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 17 Mar 2021 10:24:51 +0100 Subject: [PATCH 04/76] rcu: Fix typo in comment: kthead -> kthread Signed-off-by: Rolf Eike Beer Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- mm/oom_kill.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ad0156b86937..2cbe8f8456e6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1940,7 +1940,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } /* - * Awaken the no-CBs grace-period kthead if needed, either due to it + * Awaken the no-CBs grace-period kthread if needed, either due to it * legitimately being asleep or due to overload conditions. * * If warranted, also wake up the kthread servicing this CPUs queues. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eefd3f5fde46..54527de9cd2d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -922,7 +922,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) continue; } /* - * No kthead_use_mm() user needs to read from the userspace so + * No kthread_use_mm() user needs to read from the userspace so * we are ok to reap it. */ if (unlikely(p->flags & PF_KTHREAD)) From d0bfa8b3c411e25e014e4131d2804afe29c440a6 Mon Sep 17 00:00:00 2001 From: Zhang Qiang Date: Thu, 15 Apr 2021 19:19:56 +0200 Subject: [PATCH 05/76] kvfree_rcu: Release a page cache under memory pressure Add a drain_page_cache() function to drain a per-cpu page cache. The reason behind of it is a system can run into a low memory condition, in that case a page shrinker can ask for its users to free their caches in order to get extra memory available for other needs in a system. When a system hits such condition, a page cache is drained for all CPUs in a system. By default a page cache work is delayed with 5 seconds interval until a memory pressure disappears, if needed it can be changed. See a rcu_delay_page_cache_fill_msec module parameter. Co-developed-by: Uladzislau Rezki (Sony) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 5 ++ kernel/rcu/tree.c | 82 +++++++++++++++++-- 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cb89dbdedc46..4405fd32e8ab 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4290,6 +4290,11 @@ whole algorithm to behave better in low memory condition. + rcutree.rcu_delay_page_cache_fill_msec= [KNL] + Set the page-cache refill delay (in milliseconds) + in response to low-memory conditions. The range + of permitted values is in the range 0:100000. + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e78b2430c16..74d840aa877b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -186,6 +186,17 @@ module_param(rcu_unlock_delay, int, 0444); static int rcu_min_cached_objs = 5; module_param(rcu_min_cached_objs, int, 0444); +// A page shrinker can ask for pages to be freed to make them +// available for other parts of the system. This usually happens +// under low memory conditions, and in that case we should also +// defer page-cache filling for a short time period. +// +// The default value is 5 seconds, which is long enough to reduce +// interference with the shrinker while it asks other systems to +// drain their caches. +static int rcu_delay_page_cache_fill_msec = 5000; +module_param(rcu_delay_page_cache_fill_msec, int, 0444); + /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -3171,6 +3182,7 @@ struct kfree_rcu_cpu_work { * Even though it is lockless an access has to be protected by the * per-cpu lock. * @page_cache_work: A work to refill the cache when it is empty + * @backoff_page_cache_fill: Delay cache refills * @work_in_progress: Indicates that page_cache_work is running * @hrtimer: A hrtimer for scheduling a page_cache_work * @nr_bkv_objs: number of allocated objects at @bkvcache. @@ -3190,7 +3202,8 @@ struct kfree_rcu_cpu { bool initialized; int count; - struct work_struct page_cache_work; + struct delayed_work page_cache_work; + atomic_t backoff_page_cache_fill; atomic_t work_in_progress; struct hrtimer hrtimer; @@ -3256,6 +3269,26 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp, } +static int +drain_page_cache(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + struct llist_node *page_list, *pos, *n; + int freed = 0; + + raw_spin_lock_irqsave(&krcp->lock, flags); + page_list = llist_del_all(&krcp->bkvcache); + krcp->nr_bkv_objs = 0; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + llist_for_each_safe(pos, n, page_list) { + free_page((unsigned long)pos); + freed++; + } + + return freed; +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -3446,7 +3479,7 @@ schedule_page_work_fn(struct hrtimer *t) struct kfree_rcu_cpu *krcp = container_of(t, struct kfree_rcu_cpu, hrtimer); - queue_work(system_highpri_wq, &krcp->page_cache_work); + queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); return HRTIMER_NORESTART; } @@ -3455,12 +3488,16 @@ static void fill_page_cache_func(struct work_struct *work) struct kvfree_rcu_bulk_data *bnode; struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, - page_cache_work); + page_cache_work.work); unsigned long flags; + int nr_pages; bool pushed; int i; - for (i = 0; i < rcu_min_cached_objs; i++) { + nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? + 1 : rcu_min_cached_objs; + + for (i = 0; i < nr_pages; i++) { bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -3477,6 +3514,7 @@ static void fill_page_cache_func(struct work_struct *work) } atomic_set(&krcp->work_in_progress, 0); + atomic_set(&krcp->backoff_page_cache_fill, 0); } static void @@ -3484,10 +3522,15 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) { if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; - hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + if (atomic_read(&krcp->backoff_page_cache_fill)) { + queue_delayed_work(system_wq, + &krcp->page_cache_work, + msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); + } else { + hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + } } } @@ -3639,12 +3682,19 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { int cpu; unsigned long count = 0; + unsigned long flags; /* Snapshot count of all CPUs */ for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count += READ_ONCE(krcp->count); + + raw_spin_lock_irqsave(&krcp->lock, flags); + count += krcp->nr_bkv_objs; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + atomic_set(&krcp->backoff_page_cache_fill, 1); } return count; @@ -3661,6 +3711,8 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count = krcp->count; + count += drain_page_cache(krcp); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); @@ -4687,6 +4739,18 @@ static void __init kfree_rcu_batch_init(void) int cpu; int i; + /* Clamp it to [0:100] seconds interval. */ + if (rcu_delay_page_cache_fill_msec < 0 || + rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { + + rcu_delay_page_cache_fill_msec = + clamp(rcu_delay_page_cache_fill_msec, 0, + (int) (100 * MSEC_PER_SEC)); + + pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n", + rcu_delay_page_cache_fill_msec); + } + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); @@ -4696,7 +4760,7 @@ static void __init kfree_rcu_batch_init(void) } INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); - INIT_WORK(&krcp->page_cache_work, fill_page_cache_func); + INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } if (register_shrinker(&kfree_rcu_shrinker)) From ac7625ebd5f7bad93f821b7397fe50635f58aa4b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 15 Apr 2021 19:19:57 +0200 Subject: [PATCH 06/76] kvfree_rcu: Use [READ/WRITE]_ONCE() macros to access to nr_bkv_objs nr_bkv_objs is a count of the objects in the kvfree_rcu page cache. Accessing it requires holding the ->lock. Switch to READ_ONCE() and WRITE_ONCE() macros to provide lockless access to this counter. This lockless access is used for the shrinker. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 74d840aa877b..676a49ab5b2b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3250,7 +3250,7 @@ get_cached_bnode(struct kfree_rcu_cpu *krcp) if (!krcp->nr_bkv_objs) return NULL; - krcp->nr_bkv_objs--; + WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1); return (struct kvfree_rcu_bulk_data *) llist_del_first(&krcp->bkvcache); } @@ -3264,9 +3264,8 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp, return false; llist_add((struct llist_node *) bnode, &krcp->bkvcache); - krcp->nr_bkv_objs++; + WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1); return true; - } static int @@ -3278,7 +3277,7 @@ drain_page_cache(struct kfree_rcu_cpu *krcp) raw_spin_lock_irqsave(&krcp->lock, flags); page_list = llist_del_all(&krcp->bkvcache); - krcp->nr_bkv_objs = 0; + WRITE_ONCE(krcp->nr_bkv_objs, 0); raw_spin_unlock_irqrestore(&krcp->lock, flags); llist_for_each_safe(pos, n, page_list) { @@ -3682,18 +3681,13 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { int cpu; unsigned long count = 0; - unsigned long flags; /* Snapshot count of all CPUs */ for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count += READ_ONCE(krcp->count); - - raw_spin_lock_irqsave(&krcp->lock, flags); - count += krcp->nr_bkv_objs; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - + count += READ_ONCE(krcp->nr_bkv_objs); atomic_set(&krcp->backoff_page_cache_fill, 1); } From d434c00fa3ac476ca6295b8310d097dd71984624 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 15 Apr 2021 19:19:58 +0200 Subject: [PATCH 07/76] kvfree_rcu: Add a bulk-list check when a scheduler is run The rcu_scheduler_active flag is set to RCU_SCHEDULER_RUNNING once the scheduler is up and running. That signal is used in order to check and queue a "monitor work" to reclaim freed objects (if there are any) during early boot. This flag is used by kvfree_rcu() to determine when work can safely be queued, at which point memory passed to earlier invocations of kvfree_rcu() can be processed. However, only "krcp->head" is checked for objects that need to be released, and there are now two more, namely, "krcp->bkvhead[0]" and "krcp->bkvhead[1]". Therefore, check these two additional channels. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 676a49ab5b2b..e86f32d6b8f9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3739,7 +3739,8 @@ void __init kfree_rcu_scheduler_running(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); - if (!krcp->head || krcp->monitor_todo) { + if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) || + krcp->monitor_todo) { raw_spin_unlock_irqrestore(&krcp->lock, flags); continue; } From dd28c9f057ad099f6221829053e48f331e6f0b7f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 15 Apr 2021 19:19:59 +0200 Subject: [PATCH 08/76] kvfree_rcu: Update "monitor_todo" once a batch is started Before attempting to start a new batch the "monitor_todo" variable is set to "false" and set back to "true" when a previous RCU batch is still in progress. This is at best confusing. Thus change this variable to "false" only when a new batch has been successfully queued, otherwise, just leave it be. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e86f32d6b8f9..1ae5f88e475f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3442,15 +3442,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { // Attempt to start a new batch. - krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. + krcp->monitor_todo = false; raw_spin_unlock_irqrestore(&krcp->lock, flags); return; } // Previous RCU batch still in progress, try again later. - krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); raw_spin_unlock_irqrestore(&krcp->lock, flags); } From 7fe1da33f6bad33b79135b1df6c3476f87856928 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 15 Apr 2021 19:20:00 +0200 Subject: [PATCH 09/76] kvfree_rcu: Use kfree_rcu_monitor() instead of open-coded variant Replace an open-coded version of the kfree_rcu_monitor() function body with a call to that function. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1ae5f88e475f..d643fd8327b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3697,7 +3697,6 @@ static unsigned long kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { int cpu, freed = 0; - unsigned long flags; for_each_possible_cpu(cpu) { int count; @@ -3705,12 +3704,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) count = krcp->count; count += drain_page_cache(krcp); - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (krcp->monitor_todo) - kfree_rcu_drain_unlock(krcp, flags); - else - raw_spin_unlock_irqrestore(&krcp->lock, flags); + kfree_rcu_monitor(&krcp->monitor_work.work); sc->nr_to_scan -= count; freed += count; From d8628f35bae0d0b1f06ca32fa57de76a7055e731 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 28 Apr 2021 15:44:22 +0200 Subject: [PATCH 10/76] kvfree_rcu: Fix comments according to current code The kvfree_rcu() function now defers allocations in the common case due to the fact that there is no lockless access to the memory-allocator caches/pools. In addition, in CONFIG_PREEMPT_NONE=y and in CONFIG_PREEMPT_VOLUNTARY=y kernels, there is no reliable way to determine if spinlocks are held. As a result, allocation is deferred in the common case, and the two-argument form of kvfree_rcu() thus uses the "channel 3" queue through all the rcu_head structures. This channel is called referred to as the emergency case in comments, and these comments are now obsolete. This commit therefore updates these comments to reflect the new common-case nature of such emergencies. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d643fd8327b6..b043af7b0212 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3355,9 +3355,11 @@ static void kfree_rcu_work(struct work_struct *work) } /* - * Emergency case only. It can happen under low memory - * condition when an allocation gets failed, so the "bulk" - * path can not be temporary maintained. + * This is used when the "bulk" path can not be used for the + * double-argument of kvfree_rcu(). This happens when the + * page-cache is empty, which means that objects are instead + * queued on a linked list through their rcu_head structures. + * This list is named "Channel 3". */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; @@ -3403,8 +3405,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || (krcp->head && !krwp->head_free)) { - // Channel 1 corresponds to SLAB ptrs. - // Channel 2 corresponds to vmalloc ptrs. + // Channel 1 corresponds to the SLAB-pointer bulk path. + // Channel 2 corresponds to vmalloc-pointer bulk path. for (j = 0; j < FREE_N_CHANNELS; j++) { if (!krwp->bkvhead_free[j]) { krwp->bkvhead_free[j] = krcp->bkvhead[j]; @@ -3412,7 +3414,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) } } - // Channel 3 corresponds to emergency path. + // Channel 3 corresponds to both SLAB and vmalloc + // objects queued on the linked list. if (!krwp->head_free) { krwp->head_free = krcp->head; krcp->head = NULL; From a78d4a2a1017dea67857a1164d73642743e89a0f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 21 Apr 2021 13:22:52 +0200 Subject: [PATCH 11/76] kvfree_rcu: Refactor kfree_rcu_monitor() Currently we have three functions which depend on each other. Two of them are quite tiny and the last one where the most work is done. All of them are related to queuing RCU batches to reclaim objects after a GP. 1. kfree_rcu_monitor(). It consist of few lines. It acquires a spin-lock and calls kfree_rcu_drain_unlock(). 2. kfree_rcu_drain_unlock(). It also consists of few lines of code. It calls queue_kfree_rcu_work() to queue the batch. If this fails, it rearms the monitor work to try again later. 3. queue_kfree_rcu_work(). This provides the bulk of the functionality, attempting to start a new batch to free objects after a GP. Since there are no external users of functions [2] and [3], both can eliminated by moving all logic directly into [1], which both shrinks and simplifies the code. Also replace comments which start with "/*" to "//" format to make it unified across the file. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 86 +++++++++++++++-------------------------------- 1 file changed, 27 insertions(+), 59 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b043af7b0212..618ec9152e5e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3379,29 +3379,26 @@ static void kfree_rcu_work(struct work_struct *work) } /* - * Schedule the kfree batch RCU work to run in workqueue context after a GP. - * - * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES - * timeout has been reached. + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ -static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) +static void kfree_rcu_monitor(struct work_struct *work) { - struct kfree_rcu_cpu_work *krwp; - bool repeat = false; + struct kfree_rcu_cpu *krcp = container_of(work, + struct kfree_rcu_cpu, monitor_work.work); + unsigned long flags; int i, j; - lockdep_assert_held(&krcp->lock); + raw_spin_lock_irqsave(&krcp->lock, flags); + // Attempt to start a new batch. for (i = 0; i < KFREE_N_BATCHES; i++) { - krwp = &(krcp->krw_arr[i]); + struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - /* - * Try to detach bkvhead or head and attach it over any - * available corresponding free channel. It can be that - * a previous RCU batch is in progress, it means that - * immediately to queue another one is not possible so - * return false to tell caller to retry. - */ + // Try to detach bkvhead or head and attach it over any + // available corresponding free channel. It can be that + // a previous RCU batch is in progress, it means that + // immediately to queue another one is not possible so + // in that case the monitor work is rearmed. if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || (krcp->head && !krwp->head_free)) { @@ -3423,55 +3420,26 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) WRITE_ONCE(krcp->count, 0); - /* - * One work is per one batch, so there are three - * "free channels", the batch can handle. It can - * be that the work is in the pending state when - * channels have been detached following by each - * other. - */ + // One work is per one batch, so there are three + // "free channels", the batch can handle. It can + // be that the work is in the pending state when + // channels have been detached following by each + // other. queue_rcu_work(system_wq, &krwp->rcu_work); } - - // Repeat if any "free" corresponding channel is still busy. - if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head) - repeat = true; } - return !repeat; -} - -static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, - unsigned long flags) -{ - // Attempt to start a new batch. - if (queue_kfree_rcu_work(krcp)) { - // Success! Our job is done here. + // If there is nothing to detach, it means that our job is + // successfully done here. In case of having at least one + // of the channels that is still busy we should rearm the + // work to repeat an attempt. Because previous batches are + // still in progress. + if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) krcp->monitor_todo = false; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - return; - } - - // Previous RCU batch still in progress, try again later. - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - -/* - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. - * It invokes kfree_rcu_drain_unlock() to attempt to start another batch. - */ -static void kfree_rcu_monitor(struct work_struct *work) -{ - unsigned long flags; - struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, - monitor_work.work); - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (krcp->monitor_todo) - kfree_rcu_drain_unlock(krcp, flags); else - raw_spin_unlock_irqrestore(&krcp->lock, flags); + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static enum hrtimer_restart From 0cbc124bce8c527eb14c87f634683c5bcf4299c7 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Tue, 16 Mar 2021 16:07:10 +0530 Subject: [PATCH 12/76] mm/slub: Fix backtrace of objects to handle redzone adjustment This commit fixes commit 8e7f37f2aaa5 ("mm: Add mem_dump_obj() to print source of memory block"). With current code, the backtrace of allocated object is incorrect: / # cat /proc/meminfo [ 14.969843] slab kmalloc-64 start c8ab0140 data offset 64 pointer offset 0 size 64 allocated at 0x6b6b6b6b [ 14.970635] 0x6b6b6b6b [ 14.970794] 0x6b6b6b6b [ 14.970932] 0x6b6b6b6b [ 14.971077] 0x6b6b6b6b [ 14.971202] 0x6b6b6b6b [ 14.971317] 0x6b6b6b6b [ 14.971423] 0x6b6b6b6b [ 14.971635] 0x6b6b6b6b [ 14.971740] 0x6b6b6b6b [ 14.971871] 0x6b6b6b6b [ 14.972229] 0x6b6b6b6b [ 14.972363] 0x6b6b6b6b [ 14.972505] 0xa56b6b6b [ 14.972631] 0xbbbbbbbb [ 14.972734] 0xc8ab0400 [ 14.972891] meminfo_proc_show+0x40/0x4fc The reason is that the object address was not adjusted for the red zone. With this fix, the backtrace is correct: / # cat /proc/meminfo [ 14.870782] slab kmalloc-64 start c8ab0140 data offset 64 pointer offset 128 size 64 allocated at meminfo_proc_show+0x40/0x4f4 [ 14.871817] meminfo_proc_show+0x40/0x4f4 [ 14.872035] seq_read_iter+0x18c/0x4c4 [ 14.872229] proc_reg_read_iter+0x84/0xac [ 14.872433] generic_file_splice_read+0xe8/0x17c [ 14.872621] splice_direct_to_actor+0xb8/0x290 [ 14.872747] do_splice_direct+0xa0/0xe0 [ 14.872896] do_sendfile+0x2d0/0x438 [ 14.873044] sys_sendfile64+0x12c/0x140 [ 14.873229] ret_fast_syscall+0x0/0x58 [ 14.873372] 0xbe861de4 Acked-by: Vlastimil Babka Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Signed-off-by: Paul E. McKenney --- mm/slub.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/slub.c b/mm/slub.c index feda53ae62ba..8f2d13508ec9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4002,6 +4002,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) !(s->flags & SLAB_STORE_USER)) return; #ifdef CONFIG_SLUB_DEBUG + objp = fixup_red_left(s, objp); trackp = get_track(s, objp, TRACK_ALLOC); kpp->kp_ret = (void *)trackp->addr; #ifdef CONFIG_STACKTRACE From e548eaa116d858f07816d41e24835a41f7e7d270 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Tue, 16 Mar 2021 16:07:11 +0530 Subject: [PATCH 13/76] mm/slub: Add Support for free path information of an object This commit adds enables a stack dump for the last free of an object: slab kmalloc-64 start c8ab0140 data offset 64 pointer offset 0 size 64 allocated at meminfo_proc_show+0x40/0x4fc [ 20.192078] meminfo_proc_show+0x40/0x4fc [ 20.192263] seq_read_iter+0x18c/0x4c4 [ 20.192430] proc_reg_read_iter+0x84/0xac [ 20.192617] generic_file_splice_read+0xe8/0x17c [ 20.192816] splice_direct_to_actor+0xb8/0x290 [ 20.193008] do_splice_direct+0xa0/0xe0 [ 20.193185] do_sendfile+0x2d0/0x438 [ 20.193345] sys_sendfile64+0x12c/0x140 [ 20.193523] ret_fast_syscall+0x0/0x58 [ 20.193695] 0xbeeacde4 [ 20.193822] Free path: [ 20.193935] meminfo_proc_show+0x5c/0x4fc [ 20.194115] seq_read_iter+0x18c/0x4c4 [ 20.194285] proc_reg_read_iter+0x84/0xac [ 20.194475] generic_file_splice_read+0xe8/0x17c [ 20.194685] splice_direct_to_actor+0xb8/0x290 [ 20.194870] do_splice_direct+0xa0/0xe0 [ 20.195014] do_sendfile+0x2d0/0x438 [ 20.195174] sys_sendfile64+0x12c/0x140 [ 20.195336] ret_fast_syscall+0x0/0x58 [ 20.195491] 0xbeeacde4 Acked-by: Vlastimil Babka Co-developed-by: Vaneet Narang Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Signed-off-by: Paul E. McKenney --- mm/slab.h | 1 + mm/slab_common.c | 12 +++++++++++- mm/slub.c | 7 +++++++ mm/util.c | 2 +- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 18c1927cd196..7189daa0c586 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -640,6 +640,7 @@ struct kmem_obj_info { struct kmem_cache *kp_slab_cache; void *kp_ret; void *kp_stack[KS_ADDRS_COUNT]; + void *kp_free_stack[KS_ADDRS_COUNT]; }; void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index f8833d3e5d47..92e3aa78bb4d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -564,7 +564,7 @@ EXPORT_SYMBOL_GPL(kmem_valid_obj); * depends on the type of object and on how much debugging is enabled. * For a slab-cache object, the fact that it is a slab object is printed, * and, if available, the slab name, return address, and stack trace from - * the allocation of that object. + * the allocation and last free path of that object. * * This function will splat if passed a pointer to a non-slab object. * If you are not sure what type of object you have, you should instead @@ -609,6 +609,16 @@ void kmem_dump_obj(void *object) break; pr_info(" %pS\n", kp.kp_stack[i]); } + + if (kp.kp_free_stack[0]) + pr_cont(" Free path:\n"); + + for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) { + if (!kp.kp_free_stack[i]) + break; + pr_info(" %pS\n", kp.kp_free_stack[i]); + } + } EXPORT_SYMBOL_GPL(kmem_dump_obj); #endif diff --git a/mm/slub.c b/mm/slub.c index 8f2d13508ec9..deec894a1345 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4011,6 +4011,13 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) if (!kpp->kp_stack[i]) break; } + + trackp = get_track(s, objp, TRACK_FREE); + for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { + kpp->kp_free_stack[i] = (void *)trackp->addrs[i]; + if (!kpp->kp_free_stack[i]) + break; + } #endif #endif } diff --git a/mm/util.c b/mm/util.c index a8bf17f18a81..0b6dd9d81da7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -983,7 +983,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) * depends on the type of object and on how much debugging is enabled. * For example, for a slab-cache object, the slab name is printed, and, * if available, the return address and stack trace from the allocation - * of that object. + * and last free path of that object. */ void mem_dump_obj(void *object) { From d76e0926d8356e330afce1c711e0301132d06a67 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:03 +0100 Subject: [PATCH 14/76] rcu/nocb: Use the rcuog CPU's ->nocb_timer Currently each CPU has its own ->nocb_timer queued when the nocb_gp wakeup must be deferred. This approach has many drawbacks, compared to a solution based on a single timer per NOCB group: * There are a lot of timers to maintain. * The per-rdp ->nocb_lock must be held to queue and cancel the timer and this lock can already be heavily contended. * One timer firing doesn't cancel the other timers in the same group: - These other timers can thus cause spurious wakeups - Each rdp that queued a timer must lock both ->nocb_lock and then ->nocb_gp_lock upon exit from the kernel to idle/user/guest mode. * We can't cancel all of them if we detect an unflushed bypass in nocb_gp_wait(). In fact currently we only ever cancel the ->nocb_timer of the leader group. * The leader group's nocb_timer is cancelled without locking ->nocb_lock in nocb_gp_wait(). This currently appears to be safe but is an accident waiting to happen. * Since the timer acquires ->nocb_lock, it requires extra care in the NOCB (de-)offloading process, requiring that it be either enabled or disabled and then flushed. This commit instead uses the rcuog kthread's CPU's ->nocb_timer instead. It is protected by nocb_gp_lock, which is _way_ less contended and remains so even after this change. As a matter of fact, the nocb_timer almost never fires and the deferred wakeup is mostly carried out upon idle/user/guest entry. Now the early check performed at this point in do_nocb_deferred_wakeup() is done on rdp_gp->nocb_defer_wakeup, which is of course racy. However, this raciness is harmless because we only need the guarantee that the timer is queued if we were the last one to queue it. Any other situation (another CPU has queued it and we either see it or not) is fine. This solves all the issues listed above. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 140 +++++++++++++++++++++------------------ 2 files changed, 77 insertions(+), 64 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 71821d59d95c..b280a843bd2c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -257,7 +257,6 @@ struct rcu_data { }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ -#define RCU_NOCB_WAKE_OFF -1 #define RCU_NOCB_WAKE_NOT 0 #define RCU_NOCB_WAKE 1 #define RCU_NOCB_WAKE_FORCE 2 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ad0156b86937..5a2aa9c4e569 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -33,10 +33,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) return false; } -static inline bool rcu_running_nocb_timer(struct rcu_data *rdp) -{ - return (timer_curr_running(&rdp->nocb_timer) && !in_irq()); -} #else static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) { @@ -48,11 +44,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) return false; } -static inline bool rcu_running_nocb_timer(struct rcu_data *rdp) -{ - return false; -} - #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) @@ -72,8 +63,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) rcu_lockdep_is_held_nocb(rdp) || (rdp == this_cpu_ptr(&rcu_data) && !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || - rcu_current_is_nocb_kthread(rdp) || - rcu_running_nocb_timer(rdp)), + rcu_current_is_nocb_kthread(rdp)), "Unsafe read of RCU_NOCB offloaded state" ); @@ -1692,43 +1682,50 @@ bool rcu_is_nocb_cpu(int cpu) return false; } -/* - * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock - * and this function releases it. - */ -static bool wake_nocb_gp(struct rcu_data *rdp, bool force, - unsigned long flags) - __releases(rdp->nocb_lock) +static bool __wake_nocb_gp(struct rcu_data *rdp_gp, + struct rcu_data *rdp, + bool force, unsigned long flags) + __releases(rdp_gp->nocb_gp_lock) { bool needwake = false; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - lockdep_assert_held(&rdp->nocb_lock); if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { - rcu_nocb_unlock_irqrestore(rdp, flags); + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("AlreadyAwake")); return false; } - if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&rdp->nocb_timer); + if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&rdp_gp->nocb_timer); } - rcu_nocb_unlock_irqrestore(rdp, flags); - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); needwake = true; - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); } raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - if (needwake) + if (needwake) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); wake_up_process(rdp_gp->nocb_gp_kthread); + } return needwake; } +/* + * Kick the GP kthread for this NOCB group. + */ +static bool wake_nocb_gp(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + return __wake_nocb_gp(rdp_gp, rdp, force, flags); +} + /* * Arrange to wake the GP kthread for this NOCB group at some future * time when it is safe to do so. @@ -1736,12 +1733,18 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force, static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, const char *reason) { - if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF) - return; - if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) - mod_timer(&rdp->nocb_timer, jiffies + 1); - if (rdp->nocb_defer_wakeup < waketype) - WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + + if (rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) + mod_timer(&rdp_gp->nocb_timer, jiffies + 1); + if (rdp_gp->nocb_defer_wakeup < waketype) + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); } @@ -1968,13 +1971,14 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rdp->qlen_last_fqs_check = len; if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ - wake_nocb_gp(rdp, false, flags); + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { + rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); - rcu_nocb_unlock_irqrestore(rdp, flags); } } else if (len > rdp->qlen_last_fqs_check + qhimark) { /* ... or if many callbacks queued. */ @@ -1989,10 +1993,14 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, smp_mb(); /* Enqueue before timer_pending(). */ if ((rdp->nocb_cb_sleep || !rcu_segcblist_ready_cbs(&rdp->cblist)) && - !timer_pending(&rdp->nocb_bypass_timer)) + !timer_pending(&rdp->nocb_bypass_timer)) { + rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); - rcu_nocb_unlock_irqrestore(rdp, flags); + } else { + rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + } } else { rcu_nocb_unlock_irqrestore(rdp, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); @@ -2118,11 +2126,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bypass = true; } rnp = rdp->mynode; - if (bypass) { // Avoid race with first bypass CB. - WRITE_ONCE(my_rdp->nocb_defer_wakeup, - RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); - } + // Advance callbacks if helpful and low contention. needwake_gp = false; if (!rcu_segcblist_restempty(&rdp->cblist, @@ -2168,11 +2172,18 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) my_rdp->nocb_gp_bypass = bypass; my_rdp->nocb_gp_gp = needwait_gp; my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; - if (bypass && !rcu_nocb_poll) { - // At least one child with non-empty ->nocb_bypass, so set - // timer in order to avoid stranding its callbacks. + if (bypass) { raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); - mod_timer(&my_rdp->nocb_bypass_timer, j + 2); + // Avoid race with first bypass CB. + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + } + if (!rcu_nocb_poll) { + // At least one child with non-empty ->nocb_bypass, so set + // timer in order to avoid stranding its callbacks. + mod_timer(&my_rdp->nocb_bypass_timer, j + 2); + } raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); } if (rcu_nocb_poll) { @@ -2344,15 +2355,18 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { unsigned long flags; int ndw; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; int ret; - rcu_nocb_lock_irqsave(rdp, flags); - if (!rcu_nocb_need_deferred_wakeup(rdp)) { - rcu_nocb_unlock_irqrestore(rdp, flags); + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + + if (!rcu_nocb_need_deferred_wakeup(rdp_gp)) { + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); return false; } - ndw = READ_ONCE(rdp->nocb_defer_wakeup); - ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + + ndw = rdp_gp->nocb_defer_wakeup; + ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); return ret; @@ -2373,7 +2387,10 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t) */ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) { - if (rcu_nocb_need_deferred_wakeup(rdp)) + if (!rdp->nocb_gp_rdp) + return false; + + if (rcu_nocb_need_deferred_wakeup(rdp->nocb_gp_rdp)) return do_nocb_deferred_wakeup_common(rdp); return false; } @@ -2443,17 +2460,15 @@ static long rcu_nocb_rdp_deoffload(void *arg) swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); - rcu_nocb_lock_irqsave(rdp, flags); - /* Make sure nocb timer won't stay around */ - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF); - rcu_nocb_unlock_irqrestore(rdp, flags); - del_timer_sync(&rdp->nocb_timer); - /* - * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY with CB unlocked - * and IRQs disabled but let's be paranoid. + * Lock one last time to acquire latest callback updates from kthreads + * so we can later handle callbacks locally without locking. */ rcu_nocb_lock_irqsave(rdp, flags); + /* + * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb + * lock is released but how about being paranoid for once? + */ rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); /* * With SEGCBLIST_SOFTIRQ_ONLY, we can't use @@ -2517,8 +2532,7 @@ static long rcu_nocb_rdp_offload(void *arg) * SEGCBLIST_SOFTIRQ_ONLY mode. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - /* Re-enable nocb timer */ - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + /* * We didn't take the nocb lock while working on the * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. From 258ca95e2cd9a0fcc4508a1bf1742b1a3e9a7bbb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:04 +0100 Subject: [PATCH 15/76] timer: Revert "timer: Add timer_curr_running()" This reverts commit dcd42591ebb8a25895b551a5297ea9c24414ba54. The only user was RCU/nocb. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- include/linux/timer.h | 2 -- kernel/time/timer.c | 14 -------------- 2 files changed, 16 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 4118a97e62fb..fda13c9d1256 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -192,8 +192,6 @@ extern int try_to_del_timer_sync(struct timer_list *timer); #define del_singleshot_timer_sync(t) del_timer_sync(t) -extern bool timer_curr_running(struct timer_list *timer); - extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d111adf4a0cb..84332f01dc57 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1237,20 +1237,6 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); -bool timer_curr_running(struct timer_list *timer) -{ - int i; - - for (i = 0; i < NR_BASES; i++) { - struct timer_base *base = this_cpu_ptr(&timer_bases[i]); - - if (base->running_timer == timer) - return true; - } - - return false; -} - #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { From 94df76a1971d9c61eb2c67ae10cc294b68cbd03b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 2 Apr 2021 01:47:03 +0200 Subject: [PATCH 16/76] srcu: Remove superfluous sdp->srcu_lock_count zero filling Because alloc_percpu() zeroes out the allocated memory, there is no need to zero-fill newly allocated per-CPU memory. This commit therefore removes the loop zeroing the ->srcu_lock_count and ->srcu_unlock_count arrays from init_srcu_struct_nodes(). This is the only use of that function's is_static parameter, which this commit also removes. Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Lai Jiangshan Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Joel Fernandes Cc: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e26547b34ad3..3414aff2a4b0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -80,7 +80,7 @@ do { \ * srcu_read_unlock() running against them. So if the is_static parameter * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. */ -static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) +static void init_srcu_struct_nodes(struct srcu_struct *ssp) { int cpu; int i; @@ -148,14 +148,6 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) timer_setup(&sdp->delay_work, srcu_delay_timer, 0); sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); - if (is_static) - continue; - - /* Dynamically allocated, better be no srcu_read_locks()! */ - for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { - sdp->srcu_lock_count[i] = 0; - sdp->srcu_unlock_count[i] = 0; - } } } @@ -179,7 +171,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->sda = alloc_percpu(struct srcu_data); if (!ssp->sda) return -ENOMEM; - init_srcu_struct_nodes(ssp, is_static); + init_srcu_struct_nodes(ssp); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ From c75e9d29159b94904d10b23ad6aebdf869b61106 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 2 Apr 2021 01:47:02 +0200 Subject: [PATCH 17/76] srcu: Remove superfluous ssp initialization for early callbacks Pre-srcu_init() invocations of call_srcu() initialize the srcu_struct structure in question, so there is no need to check this initialization in srcu_init() when initiating grace periods for srcu_struct structures that had early call_srcu() invocations. This commit therefore drops the calls to check_init_srcu_struct() in srcu_init(). Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Lai Jiangshan Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Joel Fernandes Cc: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3414aff2a4b0..f4f0cbf7a02b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1388,7 +1388,6 @@ void __init srcu_init(void) while (!list_empty(&srcu_boot_list)) { ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); - check_init_srcu_struct(ssp); list_del_init(&ssp->work.work.entry); queue_work(rcu_gp_wq, &ssp->work.work); } From 7bf0a6141ab9c1d113bd85d6d13d43903a4278ba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 9 Apr 2021 00:38:58 +0200 Subject: [PATCH 18/76] srcu: Unconditionally embed struct lockdep_map Since struct lockdep_map has zero size when CONFIG_DEBUG_LOCK_ALLOC=n, this commit removes the #ifdef from the srcu_struct structure's ->dep_map. This change will simplify further manipulations of this field. Signed-off-by: Frederic Weisbecker Cc: Uladzislau Rezki Cc: Boqun Feng Cc: Lai Jiangshan Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Joel Fernandes Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 9cfcc8a756ae..cb1f4351e8ba 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -82,9 +82,7 @@ struct srcu_struct { /* callback for the barrier */ /* operation. */ struct delayed_work work; -#ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; /* Values for state variable (bottom bits of ->srcu_gp_seq). */ From 8e9c01c717df7e05c5bd1ca86aaa3a74b31f37f1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 9 Apr 2021 00:38:59 +0200 Subject: [PATCH 19/76] srcu: Initialize SRCU after timers Once srcu_init() is called, the SRCU core will make use of delayed workqueues, which rely on timers. However init_timers() is called several steps after rcu_init(). This means that a call_srcu() after rcu_init() but before init_timers() would find itself within a dangerously uninitialized timer core. This commit therefore creates a separate call to srcu_init() after init_timer() completes, which ensures that we stay in early SRCU mode until timers are safe(r). Signed-off-by: Frederic Weisbecker Cc: Uladzislau Rezki Cc: Boqun Feng Cc: Lai Jiangshan Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Joel Fernandes Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 6 ++++++ init/main.c | 2 ++ kernel/rcu/rcu.h | 6 ------ kernel/rcu/srcutree.c | 5 +++++ kernel/rcu/tiny.c | 1 - kernel/rcu/tree.c | 1 - 6 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a0895bbf71ce..e6011a9975af 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -64,6 +64,12 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); +#ifdef CONFIG_SRCU +void srcu_init(void); +#else /* #ifdef CONFIG_SRCU */ +static inline void srcu_init(void) { } +#endif /* #else #ifdef CONFIG_SRCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC /** diff --git a/init/main.c b/init/main.c index eb01e121d2f1..7b6f49c4d388 100644 --- a/init/main.c +++ b/init/main.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -979,6 +980,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) tick_init(); rcu_init_nohz(); init_timers(); + srcu_init(); hrtimers_init(); softirq_init(); timekeeping_init(); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bf0827d4b659..ca3f2af32bf8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -422,12 +422,6 @@ do { \ #endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */ -#ifdef CONFIG_SRCU -void srcu_init(void); -#else /* #ifdef CONFIG_SRCU */ -static inline void srcu_init(void) { } -#endif /* #else #ifdef CONFIG_SRCU */ - #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ static inline bool rcu_gp_is_normal(void) { return true; } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index f4f0cbf7a02b..9ec35c158740 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1384,6 +1384,11 @@ void __init srcu_init(void) { struct srcu_struct *ssp; + /* + * Once that is set, call_srcu() can follow the normal path and + * queue delayed work. This must follow RCU workqueues creation + * and timers initialization. + */ srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c8a029fbb114..340b3f8b090d 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -221,5 +221,4 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); rcu_early_boot_tests(); - srcu_init(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e78b2430c16..c35b15229cff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4735,7 +4735,6 @@ void __init rcu_init(void) WARN_ON(!rcu_gp_wq); rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_par_gp_wq); - srcu_init(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ From b5befe842e6612cf894cf4a199924ee872d8b7d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 17 Apr 2021 15:16:49 +0200 Subject: [PATCH 20/76] srcu: Fix broken node geometry after early ssp init An srcu_struct structure that is initialized before rcu_init_geometry() will have its srcu_node hierarchy based on CONFIG_NR_CPUS. Once rcu_init_geometry() is called, this hierarchy is compressed as needed for the actual maximum number of CPUs for this system. Later on, that srcu_struct structure is confused, sometimes referring to its initial CONFIG_NR_CPUS-based hierarchy, and sometimes instead to the new num_possible_cpus() hierarchy. For example, each of its ->mynode fields continues to reference the original leaf rcu_node structures, some of which might no longer exist. On the other hand, srcu_for_each_node_breadth_first() traverses to the new node hierarchy. There are at least two bad possible outcomes to this: 1) a) A callback enqueued early on an srcu_data structure (call it *sdp) is recorded pending on sdp->mynode->srcu_data_have_cbs in srcu_funnel_gp_start() with sdp->mynode pointing to a deep leaf (say 3 levels). b) The grace period ends after rcu_init_geometry() shrinks the nodes level to a single one. srcu_gp_end() walks through the new srcu_node hierarchy without ever reaching the old leaves so the callback is never executed. This is easily reproduced on an 8 CPUs machine with CONFIG_NR_CPUS >= 32 and "rcupdate.rcu_self_test=1". The srcu_barrier() after early tests verification never completes and the boot hangs: [ 5413.141029] INFO: task swapper/0:1 blocked for more than 4915 seconds. [ 5413.147564] Not tainted 5.12.0-rc4+ #28 [ 5413.151927] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 5413.159753] task:swapper/0 state:D stack: 0 pid: 1 ppid: 0 flags:0x00004000 [ 5413.168099] Call Trace: [ 5413.170555] __schedule+0x36c/0x930 [ 5413.174057] ? wait_for_completion+0x88/0x110 [ 5413.178423] schedule+0x46/0xf0 [ 5413.181575] schedule_timeout+0x284/0x380 [ 5413.185591] ? wait_for_completion+0x88/0x110 [ 5413.189957] ? mark_held_locks+0x61/0x80 [ 5413.193882] ? mark_held_locks+0x61/0x80 [ 5413.197809] ? _raw_spin_unlock_irq+0x24/0x50 [ 5413.202173] ? wait_for_completion+0x88/0x110 [ 5413.206535] wait_for_completion+0xb4/0x110 [ 5413.210724] ? srcu_torture_stats_print+0x110/0x110 [ 5413.215610] srcu_barrier+0x187/0x200 [ 5413.219277] ? rcu_tasks_verify_self_tests+0x50/0x50 [ 5413.224244] ? rdinit_setup+0x2b/0x2b [ 5413.227907] rcu_verify_early_boot_tests+0x2d/0x40 [ 5413.232700] do_one_initcall+0x63/0x310 [ 5413.236541] ? rdinit_setup+0x2b/0x2b [ 5413.240207] ? rcu_read_lock_sched_held+0x52/0x80 [ 5413.244912] kernel_init_freeable+0x253/0x28f [ 5413.249273] ? rest_init+0x250/0x250 [ 5413.252846] kernel_init+0xa/0x110 [ 5413.256257] ret_from_fork+0x22/0x30 2) An srcu_struct structure that is initialized before rcu_init_geometry() and used afterward will always have stale rdp->mynode references, resulting in callbacks to be missed in srcu_gp_end(), just like in the previous scenario. This commit therefore causes init_srcu_struct_nodes to initialize the geometry, if needed. This ensures that the srcu_node hierarchy is properly built and distributed from the get-go. Suggested-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Lai Jiangshan Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Joel Fernandes Cc: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/srcutree.c | 3 +++ kernel/rcu/tree.c | 16 +++++++++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ca3f2af32bf8..28de768bd875 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -308,6 +308,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) } } +extern void rcu_init_geometry(void); + /* Returns a pointer to the first leaf rcu_node structure. */ #define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1]) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9ec35c158740..5fdbde687feb 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -90,6 +90,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) struct srcu_node *snp; struct srcu_node *snp_first; + /* Initialize geometry if it has not already been initialized. */ + rcu_init_geometry(); + /* Work out the overall tree geometry. */ ssp->level[0] = &ssp->node[0]; for (i = 1; i < rcu_num_lvls; i++) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c35b15229cff..0420b23fc9d0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4582,11 +4582,25 @@ static void __init rcu_init_one(void) * replace the definitions in tree.h because those are needed to size * the ->node array in the rcu_state structure. */ -static void __init rcu_init_geometry(void) +void rcu_init_geometry(void) { ulong d; int i; + static unsigned long old_nr_cpu_ids; int rcu_capacity[RCU_NUM_LVLS]; + static bool initialized; + + if (initialized) { + /* + * Warn if setup_nr_cpu_ids() had not yet been invoked, + * unless nr_cpus_ids == NR_CPUS, in which case who cares? + */ + WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids); + return; + } + + old_nr_cpu_ids = nr_cpu_ids; + initialized = true; /* * Initialize any unspecified boot parameters. From 06a3ec9205d570526665c2071d1a5492c3091a54 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Mar 2021 14:41:47 -0800 Subject: [PATCH 21/76] rcu-tasks: Add block comment laying out RCU Tasks design This commit adds a block comment that gives a high-level overview of how RCU tasks grace periods progress. It also adds a note about how exiting tasks are handled, plus it gives an overview of the memory ordering. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 350ebf5051f9..94d2c2c7f0ab 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -377,6 +377,46 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // Finally, this implementation does not support high call_rcu_tasks() // rates from multiple CPUs. If this is required, per-CPU callback lists // will be needed. +// +// The implementation uses rcu_tasks_wait_gp(), which relies on function +// pointers in the rcu_tasks structure. The rcu_spawn_tasks_kthread() +// function sets these function pointers up so that rcu_tasks_wait_gp() +// invokes these functions in this order: +// +// rcu_tasks_pregp_step(): +// Invokes synchronize_rcu() in order to wait for all in-flight +// t->on_rq and t->nvcsw transitions to complete. This works because +// all such transitions are carried out with interrupts disabled. +// rcu_tasks_pertask(), invoked on every non-idle task: +// For every runnable non-idle task other than the current one, use +// get_task_struct() to pin down that task, snapshot that task's +// number of voluntary context switches, and add that task to the +// holdout list. +// rcu_tasks_postscan(): +// Invoke synchronize_srcu() to ensure that all tasks that were +// in the process of exiting (and which thus might not know to +// synchronize with this RCU Tasks grace period) have completed +// exiting. +// check_all_holdout_tasks(), repeatedly until holdout list is empty: +// Scans the holdout list, attempting to identify a quiescent state +// for each task on the list. If there is a quiescent state, the +// corresponding task is removed from the holdout list. +// rcu_tasks_postgp(): +// Invokes synchronize_rcu() in order to ensure that all prior +// t->on_rq and t->nvcsw transitions are seen by all CPUs and tasks +// to have happened before the end of this RCU Tasks grace period. +// Again, this works because all such transitions are carried out +// with interrupts disabled. +// +// For each exiting task, the exit_tasks_rcu_start() and +// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU +// read-side critical sections waited for by rcu_tasks_postscan(). +// +// Pre-grace-period update-side code is ordered before the grace via the +// ->cbs_lock and the smp_mb__after_spinlock(). Pre-grace-period read-side +// code is ordered before the grace period via synchronize_rcu() call +// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt +// disabling. /* Pre-grace-period preparation. */ static void rcu_tasks_pregp_step(void) From 9fc98e3143de7b7e8d766aef41b46ec0bc0ae4ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Mar 2021 14:46:59 -0800 Subject: [PATCH 22/76] rcu-tasks: Add block comment laying out RCU Rude design This commit adds a block comment that gives a high-level overview of how RCU Rude grace periods progress. It also gives an overview of the memory ordering. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 94d2c2c7f0ab..d6aa352cd705 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -645,8 +645,13 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } // passing an empty function to schedule_on_each_cpu(). This approach // provides an asynchronous call_rcu_tasks_rude() API and batching // of concurrent calls to the synchronous synchronize_rcu_rude() API. -// This sends IPIs far and wide and induces otherwise unnecessary context -// switches on all online CPUs, whether idle or not. +// This invokes schedule_on_each_cpu() in order to send IPIs far and wide +// and induces otherwise unnecessary context switches on all online CPUs, +// whether idle or not. +// +// Callback handling is provided by the rcu_tasks_kthread() function. +// +// Ordering is provided by the scheduler's context-switch code. // Empty function to allow workqueues to force a context switch. static void rcu_tasks_be_rude(struct work_struct *work) From 98da77199f0c629f0687b92824f1da2010f677e3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Mar 2021 14:15:00 -0800 Subject: [PATCH 23/76] torture: Fix remaining erroneous torture.sh instance of $* Although "eval" was removed from torture.sh, that commit failed to update the KCSAN instance of $* to "$@". This results in failures when (for example) --bootargs is given more than one argument. This commit therefore makes this change. There is one remaining instance of $* in torture.sh, but this is used only in the "echo" command, where quoting doesn't matter so much. Fixes: 197220d4a334 ("torture: Remove use of "eval" in torture.sh") Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 56e2e1a42569..53ec7c046262 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -302,7 +302,7 @@ function torture_set { kcsan_kmake_tag="--kmake-args" cur_kcsan_kmake_args="$kcsan_kmake_args" fi - torture_one $* --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan + torture_one "$@" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan fi } From 3d2cc4fec861a825ecd7d9ce2797df4e5f0f5517 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Mar 2021 17:21:17 -0800 Subject: [PATCH 24/76] torture: Add "scenarios" option to kvm.sh --dryrun parameter This commit adds "--dryrun scenarios" to kvm.sh, which prints something like this: 1. TREE03 2. TREE07 3. SRCU-P SRCU-N 4. TREE01 TRACE01 5. TREE02 TRACE02 6. TREE04 RUDE01 TASKS01 7. TREE05 TASKS03 SRCU-T SRCU-U 8. TASKS02 TINY01 TINY02 TREE09 This format is more convenient for scripts that run batches of scenarios. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 6bf00a003d3d..3bd523a33700 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -64,7 +64,7 @@ usage () { echo " --cpus N" echo " --datestamp string" echo " --defconfig string" - echo " --dryrun batches|sched|script" + echo " --dryrun batches|scenarios|sched|script" echo " --duration minutes | s | h | d" echo " --gdb" echo " --help" @@ -130,7 +130,7 @@ do shift ;; --dryrun) - checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|sched\|script' '^--' + checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|scenarios\|sched\|script' '^--' dryrun=$2 shift ;; @@ -577,6 +577,25 @@ egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | print batchno, $1, $2 }' > $T/batches +# As above, but one line per batch. +grep -v '^#' $T/batches | awk ' +BEGIN { + oldbatch = 1; +} + +{ + if (oldbatch != $1) { + print ++n ". " curbatch; + curbatch = ""; + oldbatch = $1; + } + curbatch = curbatch " " $2; +} + +END { + print ++n ". " curbatch; +}' > $T/scenarios + if test "$dryrun" = script then cat $T/script @@ -597,11 +616,16 @@ elif test "$dryrun" = batches then cat $T/batches exit 0 +elif test "$dryrun" = scenarios +then + cat $T/scenarios + exit 0 else # Not a dryrun. Record the batches and the number of CPUs, then run the script. bash $T/script ret=$? cp $T/batches $resdir/$ds/batches + cp $T/scenarios $resdir/$ds/scenarios echo '#' cpus=$cpus >> $resdir/$ds/batches echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a $resdir/$ds/log exit $ret From fb4855c36249b3609718d2b83f4756b748a83349 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Mar 2021 17:52:59 -0800 Subject: [PATCH 25/76] torture: Make kvm-again.sh use "scenarios" rather than "batches" file This commit saves a few lines of code by making kvm-again.sh use the "scenarios" file rather than the "batches" file, both of which are generated by kvm.sh. This results in a break point because new versions of kvm-again.sh cannot handle "res" directories produced by old versions of kvm.sh, which lack the "scenarios" file. In the unlikely event that this becomes a problem, a trivial script suffices to convert the "batches" file to a "scenarios" file, and this script may be easily extracted from kvm.sh. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-again.sh | 22 +++++-------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 46e47a00a7db..b74bb4343ab9 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -29,7 +29,7 @@ then echo "Usage: $scriptname /path/to/old/run [ options ]" exit 1 fi -if ! cp "$oldrun/batches" $T/batches.oldrun +if ! cp "$oldrun/scenarios" $T/scenarios.oldrun then # Later on, can reconstitute this from console.log files. echo Prior run batches file does not exist: $oldrun/batches @@ -165,22 +165,12 @@ done grep '^#' $i | sed -e 's/^# //' > $T/qemu-cmd-settings . $T/qemu-cmd-settings -grep -v '^#' $T/batches.oldrun | awk ' -BEGIN { - oldbatch = 1; -} - +grep -v '^#' $T/scenarios.oldrun | awk ' { - if (oldbatch != $1) { - print "kvm-test-1-run-batch.sh" curbatch; - curbatch = ""; - oldbatch = $1; - } - curbatch = curbatch " " $2; -} - -END { - print "kvm-test-1-run-batch.sh" curbatch + curbatch = ""; + for (i = 2; i <= NF; i++) + curbatch = curbatch " " $i; + print "kvm-test-1-run-batch.sh" curbatch; }' > $T/runbatches.sh if test -n "$dryrun" From 68d415f91ff2284828211e937f12a3f6d9a18cb9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Mar 2021 13:12:36 -0800 Subject: [PATCH 26/76] refscale: Allow CPU hotplug to be enabled It is no longer possible to disable CPU hotplug in many configurations, which means that the CONFIG_HOTPLUG_CPU=n lines in refscale's Kconfig options are just a source of useless diagnostics. In addition, refscale doesn't do CPU-hotplug operations in any case. This commit therefore changes these lines to read CONFIG_HOTPLUG_CPU=y. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT | 2 +- tools/testing/selftests/rcutorture/configs/refscale/PREEMPT | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT index 1cd25b7314e3..ad505a887bec 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT +++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT @@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n +CONFIG_HOTPLUG_CPU=y CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_NOCB_CPU=n diff --git a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT index d10bc694f42c..4f08e641bb6b 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT +++ b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT @@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n +CONFIG_HOTPLUG_CPU=y CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_NOCB_CPU=n From 179141865d08d9b9ebdbef8775b2450dc6f98a14 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Mar 2021 13:15:31 -0800 Subject: [PATCH 27/76] rcuscale: Allow CPU hotplug to be enabled It is no longer possible to disable CPU hotplug in many configurations, which means that the CONFIG_HOTPLUG_CPU=n lines in rcuscale's Kconfig options are just a source of useless diagnostics. In addition, rcuscale doesn't do CPU-hotplug operations in any case. This commit therefore changes these lines to read CONFIG_HOTPLUG_CPU=y. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcuscale/TREE | 2 +- tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE index 721cfda76ab2..4cc1cc581321 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE @@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n +CONFIG_HOTPLUG_CPU=y CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_NOCB_CPU=n diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 index 7629f5dd73b2..f5952061fde7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 @@ -8,7 +8,7 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n +CONFIG_HOTPLUG_CPU=y CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=3 From 0092eae4cb4e4a34b728efcf9d5857ab0ac2e6f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Mar 2021 13:59:54 -0800 Subject: [PATCH 28/76] torture: Add kvm-remote.sh script for distributed rcutorture test runs This commit adds a kvm-remote.sh script that prepares a tarball that is then downloaded to the remote system(s) and executed. The user is responsible for having set up the remote systems to run qemu, but all the kernel builds are done on the system running the kvm-remote.sh script. The user is also responsible for setting up the remote systems so that ssh can be run non-interactively, given that ssh is used to poll the remote systems in order to detect completion of each batch. See the script's header comment for usage information. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-remote.sh | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-remote.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh new file mode 100755 index 000000000000..c4859fca87b2 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -0,0 +1,227 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Run a series of tests on remote systems under KVM. +# +# Usage: kvm-remote.sh "systems" [ ] +# kvm-remote.sh "systems" /path/to/old/run [ ] +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +scriptname=$0 +args="$*" + +if ! test -d tools/testing/selftests/rcutorture/bin +then + echo $scriptname must be run from top-level directory of kernel source tree. + exit 1 +fi + +KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM +PATH=${KVM}/bin:$PATH; export PATH +. functions.sh + +starttime="`get_starttime`" + +systems="$1" +if test -z "$systems" +then + echo $scriptname: Empty list of systems will go nowhere good, giving up. + exit 1 +fi +shift + +# Pathnames: +# T: /tmp/kvm-remote.sh.$$ +# resdir: /tmp/kvm-remote.sh.$$/res +# rundir: /tmp/kvm-remote.sh.$$/res/$ds ("-remote" suffix) +# oldrun: `pwd`/tools/testing/.../res/$otherds +# +# Pathname segments: +# TD: kvm-remote.sh.$$ +# ds: yyyy.mm.dd-hh.mm.ss-remote + +TD=kvm-remote.sh.$$ +T=${TMPDIR-/tmp}/$TD +trap 'rm -rf $T' 0 +mkdir $T + +resdir="$T/res" +ds=`date +%Y.%m.%d-%H.%M.%S`-remote +rundir=$resdir/$ds +echo Results directory: $rundir +echo $scriptname $args +if echo $1 | grep -q '^--' +then + # Fresh build. Create a datestamp unless the caller supplied one. + datestamp="`echo "$@" | awk -v ds="$ds" '{ + for (i = 1; i < NF; i++) { + if ($i == "--datestamp") { + ds = ""; + break; + } + } + if (ds != "") + print "--datestamp " ds; + }'`" + kvm.sh "$@" $datestamp --buildonly > $T/kvm.sh.out 2>&1 + ret=$? + if test "$ret" -ne 0 + then + echo $scriptname: kvm.sh failed exit code $? + cat $T/kvm.sh.out + exit 2 + fi + oldrun="`grep -m 1 "^Results directory: " $T/kvm.sh.out | awk '{ print $3 }'`" + touch "$oldrun/remote-log" + echo $scriptname $args >> "$oldrun/remote-log" + echo | tee -a "$oldrun/remote-log" + echo " ----" kvm.sh output: "(`date`)" | tee -a "$oldrun/remote-log" + cat $T/kvm.sh.out | tee -a "$oldrun/remote-log" + # We are going to run this, so remove the buildonly files. + rm -f "$oldrun"/*/buildonly + kvm-again.sh $oldrun --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1 + ret=$? + if test "$ret" -ne 0 + then + echo $scriptname: kvm-again.sh failed exit code $? | tee -a "$oldrun/remote-log" + cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log" + exit 2 + fi +else + # Re-use old run. + oldrun="$1" + if ! echo $oldrun | grep -q '^/' + then + oldrun="`pwd`/$oldrun" + fi + shift + touch "$oldrun/remote-log" + echo $scriptname $args >> "$oldrun/remote-log" + kvm-again.sh "$oldrun" "$@" --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1 + ret=$? + if test "$ret" -ne 0 + then + echo $scriptname: kvm-again.sh failed exit code $? | tee -a "$oldrun/remote-log" + cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log" + exit 2 + fi + cp -a "$rundir" "$KVM/res/" + oldrun="$KVM/res/$ds" +fi +echo | tee -a "$oldrun/remote-log" +echo " ----" kvm-again.sh output: "(`date`)" | tee -a "$oldrun/remote-log" +cat $T/kvm-again.sh.out +echo | tee -a "$oldrun/remote-log" +echo Remote run directory: $rundir | tee -a "$oldrun/remote-log" +echo Local build-side run directory: $oldrun | tee -a "$oldrun/remote-log" + +# Create the kvm-remote-N.sh scripts in the bin directory. +awk < "$rundir"/scenarios -v dest="$T/bin" -v rundir="$rundir" ' +{ + n = $1; + sub(/\./, "", n); + fn = dest "/kvm-remote-" n ".sh" + scenarios = ""; + for (i = 2; i <= NF; i++) + scenarios = scenarios " " $i; + print "kvm-test-1-run-batch.sh" scenarios > fn; + print "rm " rundir "/remote.run" >> fn; +}' +chmod +x $T/bin/kvm-remote-*.sh +( cd "`dirname $T`"; tar -chzf $T/binres.tgz "$TD/bin" "$TD/res" ) + +# Check first to avoid the need for cleanup for system-name typos +for i in $systems +do + ncpus="`ssh $i lscpu | grep '^CPU(' | awk '{ print $2 }'`" + echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log" + ret=$? + if test "$ret" -ne 0 + then + echo System $i unreachable, giving up. | tee -a "$oldrun/remote-log" + exit 4 | tee -a "$oldrun/remote-log" + fi +done + +# Download and expand the tarball on all systems. +for i in $systems +do + echo Downloading tarball to $i `date` | tee -a "$oldrun/remote-log" + cat $T/binres.tgz | ssh $i "cd /tmp; tar -xzf -" + ret=$? + if test "$ret" -ne 0 + then + echo Unable to download $T/binres.tgz to system $i, giving up. | tee -a "$oldrun/remote-log" + exit 10 | tee -a "$oldrun/remote-log" + fi +done + +# Function to start batches on idle remote $systems +# +# Usage: startbatches curbatch nbatches +# +# Batches are numbered starting at 1. Returns the next batch to start. +# Be careful to redirect all debug output to FD 2 (stderr). +startbatches () { + local curbatch="$1" + local nbatches="$2" + local ret + + # Each pass through the following loop examines one system. + for i in $systems + do + if test "$curbatch" -gt "$nbatches" + then + echo $((nbatches + 1)) + return 0 + fi + if ssh "$i" "test -f \"$resdir/$ds/remote.run\"" 1>&2 + then + continue # System still running last test, skip. + fi + ssh "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2 + ret=$? + if test "$ret" -ne 0 + then + echo ssh $i failed: exitcode $ret 1>&2 + exit 11 + fi + echo " ----" System $i Batch `head -n $curbatch < "$rundir"/scenarios | tail -1` `date` 1>&2 + curbatch=$((curbatch + 1)) + done + echo $curbatch +} + +# Launch all the scenarios. +nbatches="`wc -l "$rundir"/scenarios | awk '{ print $1 }'`" +curbatch=1 +while test "$curbatch" -le "$nbatches" +do + startbatches $curbatch $nbatches > $T/curbatch 2> $T/startbatches.stderr + curbatch="`cat $T/curbatch`" + if test -s "$T/startbatches.stderr" + then + cat "$T/startbatches.stderr" | tee -a "$oldrun/remote-log" + fi + if test "$curbatch" -le "$nbatches" + then + sleep 30 + fi +done +echo All batches started. `date` + +# Wait for all remaining scenarios to complete and collect results. +for i in $systems +do + while ssh "$i" "test -f \"$resdir/$ds/remote.run\"" + do + sleep 30 + done + ( cd "$oldrun"; ssh $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu_pid */qemu-retval; rm -rf $T > /dev/null 2>&1" | tar -xzf - ) +done + +( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log" +exit "`cat $T/exitcode`" From e9b800db96fa40170c5607d8968b2ec6212c2026 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Mar 2021 18:02:36 -0800 Subject: [PATCH 29/76] refscale: Add acqrel, lock, and lock-irq This commit adds scale_type of acqrel, lock, and lock-irq to test acquisition and release. Note that the refscale.nreaders=1 module parameter is required if you wish to test uncontended locking. In contrast, acqrel uses a per-CPU variable, so should be just fine with large values of the refscale.nreaders=1 module parameter. Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 109 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 02dd9767b559..313d4547cbc7 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -362,6 +362,111 @@ static struct ref_scale_ops rwsem_ops = { .name = "rwsem" }; +// Definitions for global spinlock +static DEFINE_SPINLOCK(test_lock); + +static void ref_lock_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + spin_lock(&test_lock); + spin_unlock(&test_lock); + } + preempt_enable(); +} + +static void ref_lock_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + spin_lock(&test_lock); + un_delay(udl, ndl); + spin_unlock(&test_lock); + } + preempt_enable(); +} + +static struct ref_scale_ops lock_ops = { + .readsection = ref_lock_section, + .delaysection = ref_lock_delay_section, + .name = "lock" +}; + +// Definitions for global irq-save spinlock + +static void ref_lock_irq_section(const int nloops) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + spin_lock_irqsave(&test_lock, flags); + spin_unlock_irqrestore(&test_lock, flags); + } + preempt_enable(); +} + +static void ref_lock_irq_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + spin_lock_irqsave(&test_lock, flags); + un_delay(udl, ndl); + spin_unlock_irqrestore(&test_lock, flags); + } + preempt_enable(); +} + +static struct ref_scale_ops lock_irq_ops = { + .readsection = ref_lock_irq_section, + .delaysection = ref_lock_irq_delay_section, + .name = "lock-irq" +}; + +// Definitions acquire-release. +static DEFINE_PER_CPU(unsigned long, test_acqrel); + +static void ref_acqrel_section(const int nloops) +{ + unsigned long x; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x = smp_load_acquire(this_cpu_ptr(&test_acqrel)); + smp_store_release(this_cpu_ptr(&test_acqrel), x + 1); + } + preempt_enable(); +} + +static void ref_acqrel_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned long x; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x = smp_load_acquire(this_cpu_ptr(&test_acqrel)); + un_delay(udl, ndl); + smp_store_release(this_cpu_ptr(&test_acqrel), x + 1); + } + preempt_enable(); +} + +static struct ref_scale_ops acqrel_ops = { + .readsection = ref_acqrel_section, + .delaysection = ref_acqrel_delay_section, + .name = "acqrel" +}; + static void rcu_scale_one_reader(void) { if (readdelay <= 0) @@ -653,8 +758,8 @@ ref_scale_init(void) long i; int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, - &refcnt_ops, &rwlock_ops, &rwsem_ops, + &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, + &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, }; if (!torture_init_begin(scale_type, verbose)) From a5c095e0e9b6fedcffd0907c84f77751128e2a34 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 13 Mar 2021 20:05:31 -0800 Subject: [PATCH 30/76] rcutorture: Abstract read-lock-held checks This commit adds a (*readlock_held)() function pointer to the rcu_torture_ops structure in order to make the rcu_torture_one_read() function's rcu_dereference_check() lockdep expression more appropriate for a given run. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 29d2f4c647d3..bf488f957948 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -331,6 +331,7 @@ struct rcu_torture_ops { void (*read_delay)(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp); void (*readunlock)(int idx); + int (*readlock_held)(void); unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); @@ -359,6 +360,11 @@ static struct rcu_torture_ops *cur_ops; * Definitions for rcu torture testing. */ +static int torture_readlock_not_held(void) +{ + return rcu_read_lock_bh_held() || rcu_read_lock_sched_held(); +} + static int rcu_torture_read_lock(void) __acquires(RCU) { rcu_read_lock(); @@ -488,6 +494,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, + .readlock_held = torture_readlock_not_held, .get_gp_seq = rcu_get_gp_seq, .gp_diff = rcu_seq_diff, .deferred_free = rcu_torture_deferred_free, @@ -540,6 +547,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, + .readlock_held = torture_readlock_not_held, .get_gp_seq = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, @@ -589,6 +597,11 @@ static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp) srcu_read_unlock(srcu_ctlp, idx); } +static int torture_srcu_read_lock_held(void) +{ + return srcu_read_lock_held(srcu_ctlp); +} + static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(srcu_ctlp); @@ -646,6 +659,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -681,6 +695,7 @@ static struct rcu_torture_ops srcud_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -700,6 +715,7 @@ static struct rcu_torture_ops busted_srcud_ops = { .readlock = srcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = srcu_torture_read_unlock, + .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -787,6 +803,7 @@ static struct rcu_torture_ops trivial_ops = { .readlock = rcu_torture_read_lock_trivial, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock_trivial, + .readlock_held = torture_readlock_not_held, .get_gp_seq = rcu_no_completed, .sync = synchronize_rcu_trivial, .exp_sync = synchronize_rcu_trivial, @@ -850,6 +867,7 @@ static struct rcu_torture_ops tasks_tracing_ops = { .readlock = tasks_tracing_torture_read_lock, .read_delay = srcu_read_delay, /* just reuse srcu's version. */ .readunlock = tasks_tracing_torture_read_unlock, + .readlock_held = rcu_read_lock_trace_held, .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_tracing_torture_deferred_free, .sync = synchronize_rcu_tasks_trace, @@ -871,11 +889,6 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) return cur_ops->gp_diff(new, old); } -static bool __maybe_unused torturing_tasks(void) -{ - return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops; -} - /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -1553,11 +1566,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp) || - rcu_read_lock_trace_held() || - torturing_tasks()); + !cur_ops->readlock_held || cur_ops->readlock_held()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ rcutorture_one_extend(&readstate, 0, trsp, rtrsp); From 32dbdaf71ab9b606d0649616039c897df2b03e47 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 14 Mar 2021 15:19:59 -0700 Subject: [PATCH 31/76] torture: Fix grace-period rate output The kvm-again.sh script relies on shell comments added to the qemu-cmd file, but this means that code extracting values from the QEMU command in this file must grep out those commment. Which kvm-recheck-rcu.sh failed to do, which destroyed its grace-period-per-second calculation. This commit therefore adds the needed "grep -v '^#'" to kvm-recheck-rcu.sh. Fixes: 315957cad445 ("torture: Prepare for splitting qemu execution from kvm-test-1-run.sh") Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh index 1706cd4466b4..fbdf162b6acd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh @@ -31,7 +31,7 @@ then echo "$configfile ------- " $stopstate else title="$configfile ------- $ngps GPs" - dur=`sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null` + dur=`grep -v '^#' $i/qemu-cmd | sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//'` if test -z "$dur" then : From ee8fef9137e9e75a36342077a2414dbd86c703bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Mar 2021 12:26:04 -0700 Subject: [PATCH 32/76] torture: Abstract end-of-run summary This commit abstractst the end-of-run summary from kvm-again.sh, and, while in the area, brings its format into line with that of kvm.sh. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-again.sh | 11 ++--- .../rcutorture/bin/kvm-end-run-stats.sh | 40 +++++++++++++++++++ 2 files changed, 43 insertions(+), 8 deletions(-) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index b74bb4343ab9..d8c8483c46f1 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -143,6 +143,8 @@ then usage fi rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log +touch "$rundir/log" +echo $scriptname $args | tee -a "$rundir/log" echo $oldrun > "$rundir/re-run" if ! test -d "$rundir/../../bin" then @@ -178,12 +180,5 @@ then echo ---- Dryrun complete, directory: $rundir | tee -a "$rundir/log" else ( cd "$rundir"; sh $T/runbatches.sh ) - kcsan-collapse.sh "$rundir" | tee -a "$rundir/log" - echo | tee -a "$rundir/log" - echo ---- Results directory: $rundir | tee -a "$rundir/log" - kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1 - ret=$? - cat $T/kvm-recheck.sh.out | tee -a "$rundir/log" - echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log" - exit $ret + kvm-end-run-stats.sh "$rundir" "$starttime" fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh b/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh new file mode 100755 index 000000000000..e4a00779b8c6 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Check the status of the specified run. +# +# Usage: kvm-end-run-stats.sh /path/to/run starttime +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +# scriptname=$0 +# args="$*" +rundir="$1" +if ! test -d "$rundir" +then + echo kvm-end-run-stats.sh: Specified run directory does not exist: $rundir + exit 1 +fi + +T=${TMPDIR-/tmp}/kvm-end-run-stats.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM +PATH=${KVM}/bin:$PATH; export PATH +. functions.sh +default_starttime="`get_starttime`" +starttime="${2-default_starttime}" + +echo | tee -a "$rundir/log" +echo | tee -a "$rundir/log" +echo " --- `date` Test summary:" | tee -a "$rundir/log" +echo Results directory: $rundir | tee -a "$rundir/log" +kcsan-collapse.sh "$rundir" | tee -a "$rundir/log" +kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1 +ret=$? +cat $T/kvm-recheck.sh.out | tee -a "$rundir/log" +echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log" +exit $ret From f254a0b52787d108879cc8761ee4f6ce33698029 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Mar 2021 13:21:41 -0700 Subject: [PATCH 33/76] torture: Make kvm.sh use abstracted kvm-end-run-stats.sh This commit reduces duplicate code by making kvm.sh use the new kvm-end-run-stats.sh script rather than taking its historical approach of open-coding it. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 3bd523a33700..fab3bd9cf9d1 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -550,20 +550,7 @@ END { if (ncpus != 0) dump(first, i, batchnum); }' >> $T/script - -cat << '___EOF___' >> $T/script -echo | tee -a $TORTURE_RESDIR/log -echo | tee -a $TORTURE_RESDIR/log -echo " --- `date` Test summary:" | tee -a $TORTURE_RESDIR/log -___EOF___ -cat << ___EOF___ >> $T/script -echo Results directory: $resdir/$ds | tee -a $resdir/$ds/log -kcsan-collapse.sh $resdir/$ds | tee -a $resdir/$ds/log -kvm-recheck.sh $resdir/$ds > $T/kvm-recheck.sh.out 2>&1 -___EOF___ -echo 'ret=$?' >> $T/script -echo "cat $T/kvm-recheck.sh.out | tee -a $resdir/$ds/log" >> $T/script -echo 'exit $ret' >> $T/script +echo kvm-end-run-stats.sh "$resdir/$ds" "$starttime" >> $T/script # Extract the tests and their batches from the script. egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | @@ -627,7 +614,6 @@ else cp $T/batches $resdir/$ds/batches cp $T/scenarios $resdir/$ds/scenarios echo '#' cpus=$cpus >> $resdir/$ds/batches - echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a $resdir/$ds/log exit $ret fi From b09751d752fb0e8dce4062254da9f813dcb00de5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Mar 2021 14:00:59 -0700 Subject: [PATCH 34/76] torture: Make the build machine control N in "make -jN" Given remote rcutorture runs, it is quite possible that the build system will have fewer CPUs than the system(s) running the actual test scenarios. In such cases, using the number of CPUs on the test systems can overload the build system, slowing down the build or, worse, OOMing the build system. This commit therefore uses the build system's CPU count to set N in "make -jN", and by tradition sets "N" to double the CPU count. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-build.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index 115e1822b26f..55f4fc102624 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -40,8 +40,10 @@ if test $retval -gt 1 then exit 2 fi -ncpus=`cpus2use.sh` -make -j$ncpus $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1 + +# Tell "make" to use double the number of real CPUs on the build system. +ncpus="`lscpu | grep '^CPU(' | awk '{ print $2 }'`" +make -j$((2 * ncpus)) $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1 retval=$? if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | egrep -q "Stop|Error|error:|warning:" || egrep -q "Stop|Error|error:" < $resdir/Make.out then From 226dd39d23487c01ab5cc1d68eba142a4dc76a08 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Mar 2021 19:39:14 -0700 Subject: [PATCH 35/76] torture: Make kvm-find-errors.sh account for kvm-remote.sh Currently, kvm-find-errors.sh assumes that if "--buildonly" appears in the log file, then the run did builds but ran no kernels. This breaks with kvm-remote.sh, which uses kvm.sh to do a build, then kvm-again.sh to run the kernels built on remote systems. This commit therefore adds a check for a kvm-remote.sh run. While in the area, this commit checks for "--build-only" as well as "--build-only". Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh index 0670841122d8..daf64b507038 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh @@ -43,7 +43,7 @@ then else echo No build errors. fi -if grep -q -e "--buildonly" < ${rundir}/log +if grep -q -e "--build-\?only" < ${rundir}/log && ! test -f "${rundir}/remote-log" then echo Build-only run, no console logs to check. exit $editorret From ea6d962e80b61996aeacb443661cc3adcb605315 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Mar 2021 16:30:32 -0700 Subject: [PATCH 36/76] rcutorture: Judge RCU priority boosting on grace periods, not callbacks Currently, rcutorture's testing of RCU priority boosting insists not only that grace periods complete, but also that callbacks be invoked. Although this is in fact what the user would want, ensuring that there is sufficient CPU bandwidth devoted to callback execution is in fact the user's responsibility. One could argue that rcutorture can take on that responsibility, which is true in theory. But in practice, ensuring sufficient CPU bandwidth to ksoftirqd, any rcuc kthreads, and any rcuo kthreads is not particularly consistent with rcutorture's main job, that of stress-testing RCU. In addition, if the system administrator (say) makes very poor choices when pinning rcuo kthreads and then runs rcutorture, there really isn't much rcutorture can do. Besides, RCU priority boosting only boosts lagging readers, not all the machinery required to invoke callbacks in a timely fashion. This commit therefore switches rcutorture's evaluation of RCU priority boosting from callback execution to grace-period completion by using the new start_poll_synchronize_rcu() and poll_state_synchronize_rcu() functions. When rcutorture is built in (as in when there is no innocent workload to inconvenience), the ksoftirqd ktheads are boosted to real-time priority 2 in order to allow timeouts to work properly in the face of rcutorture's testing of RCU priority boosting. Indeed, it is not as easy as it looks to create a reliable test of RCU priority boosting without destroying the rest of the kernel! Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 111 ++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 60 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bf488f957948..06d08f4f3e52 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -245,12 +245,6 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_PREEMPT_RT) -# define rcu_can_boost() 1 -#else -# define rcu_can_boost() 0 -#endif - #ifdef CONFIG_RCU_TRACE static u64 notrace rcu_trace_clock_local(void) { @@ -511,7 +505,7 @@ static struct rcu_torture_ops rcu_ops = { .gp_kthread_dbg = show_rcu_gp_kthreads, .stall_dur = rcu_jiffies_till_stall_check, .irq_capable = 1, - .can_boost = rcu_can_boost(), + .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, .name = "rcu" }; @@ -891,25 +885,11 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) /* * RCU torture priority-boost testing. Runs one real-time thread per - * CPU for moderate bursts, repeatedly registering RCU callbacks and - * spinning waiting for them to be invoked. If a given callback takes - * too long to be invoked, we assume that priority inversion has occurred. + * CPU for moderate bursts, repeatedly starting grace periods and waiting + * for them to complete. If a given grace period takes too long, we assume + * that priority inversion has occurred. */ -struct rcu_boost_inflight { - struct rcu_head rcu; - int inflight; -}; - -static void rcu_torture_boost_cb(struct rcu_head *head) -{ - struct rcu_boost_inflight *rbip = - container_of(head, struct rcu_boost_inflight, rcu); - - /* Ensure RCU-core accesses precede clearing ->inflight */ - smp_store_release(&rbip->inflight, 0); -} - static int old_rt_runtime = -1; static void rcu_torture_disable_rt_throttle(void) @@ -936,15 +916,18 @@ static void rcu_torture_enable_rt_throttle(void) old_rt_runtime = -1; } -static bool rcu_torture_boost_failed(unsigned long start, unsigned long end) +static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start, unsigned long end) { static int dbg_done; if (end - start > test_boost_duration * HZ - HZ / 2) { VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; - if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) + if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) { + pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n", + current->rt_priority, gp_state, end - start); cur_ops->gp_kthread_dbg(); + } return true; /* failed */ } @@ -954,21 +937,20 @@ static bool rcu_torture_boost_failed(unsigned long start, unsigned long end) static int rcu_torture_boost(void *arg) { - unsigned long call_rcu_time; unsigned long endtime; + unsigned long gp_state; + unsigned long gp_state_time; unsigned long oldstarttime; - struct rcu_boost_inflight rbi = { .inflight = 0 }; VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ sched_set_fifo_low(current); - init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { bool failed = false; // Test failed already in this test interval - bool firsttime = true; + bool gp_initiated = false; /* Increment n_rcu_torture_boosts once per boost-test */ while (!kthread_should_stop()) { @@ -992,33 +974,33 @@ static int rcu_torture_boost(void *arg) goto checkwait; } - /* Do one boost-test interval. */ + // Do one boost-test interval. endtime = oldstarttime + test_boost_duration * HZ; while (time_before(jiffies, endtime)) { - /* If we don't have a callback in flight, post one. */ - if (!smp_load_acquire(&rbi.inflight)) { - /* RCU core before ->inflight = 1. */ - smp_store_release(&rbi.inflight, 1); - cur_ops->call(&rbi.rcu, rcu_torture_boost_cb); - /* Check if the boost test failed */ - if (!firsttime && !failed) - failed = rcu_torture_boost_failed(call_rcu_time, jiffies); - call_rcu_time = jiffies; - firsttime = false; + // Has current GP gone too long? + if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state)) + failed = rcu_torture_boost_failed(gp_state, gp_state_time, jiffies); + // If we don't have a grace period in flight, start one. + if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) { + gp_state = cur_ops->start_gp_poll(); + gp_initiated = true; + gp_state_time = jiffies; } - if (stutter_wait("rcu_torture_boost")) + if (stutter_wait("rcu_torture_boost")) { sched_set_fifo_low(current); + // If the grace period already ended, + // we don't know when that happened, so + // start over. + if (cur_ops->poll_gp_state(gp_state)) + gp_initiated = false; + } if (torture_must_stop()) goto checkwait; } - /* - * If boost never happened, then inflight will always be 1, in - * this case the boost check would never happen in the above - * loop so do another one here. - */ - if (!firsttime && !failed && smp_load_acquire(&rbi.inflight)) - rcu_torture_boost_failed(call_rcu_time, jiffies); + // In case the grace period extended beyond the end of the loop. + if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state)) + rcu_torture_boost_failed(gp_state, gp_state_time, jiffies); /* * Set the start time of the next test interval. @@ -1027,11 +1009,9 @@ static int rcu_torture_boost(void *arg) * interval. Besides, we are running at RT priority, * so delays should be relatively rare. */ - while (oldstarttime == boost_starttime && - !kthread_should_stop()) { + while (oldstarttime == boost_starttime && !kthread_should_stop()) { if (mutex_trylock(&boost_mutex)) { - boost_starttime = jiffies + - test_boost_interval * HZ; + boost_starttime = jiffies + test_boost_interval * HZ; mutex_unlock(&boost_mutex); break; } @@ -1043,15 +1023,11 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); - while (smp_load_acquire(&rbi.inflight)) - schedule_timeout_uninterruptible(1); // rcu_barrier() deadlocks. - /* Clean up and exit. */ - while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) { + while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); schedule_timeout_uninterruptible(1); } - destroy_rcu_head_on_stack(&rbi.rcu); torture_kthread_stopping("rcu_torture_boost"); return 0; } @@ -2643,7 +2619,7 @@ static bool rcu_torture_can_boost(void) if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2) return false; - if (!cur_ops->call) + if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state) return false; prio = rcu_get_gp_kthreads_prio(); @@ -2651,7 +2627,7 @@ static bool rcu_torture_can_boost(void) return false; if (prio < 2) { - if (boost_warn_once == 1) + if (boost_warn_once == 1) return false; pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME); @@ -3129,6 +3105,21 @@ rcu_torture_init(void) if (firsterr < 0) goto unwind; rcutor_hp = firsterr; + + // Testing RCU priority boosting requires rcutorture do + // some serious abuse. Counter this by running ksoftirqd + // at higher priority. + if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) { + for_each_online_cpu(cpu) { + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(ksoftirqd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + } } shutdown_jiffies = jiffies + shutdown_secs * HZ; firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); From f8c8484dbda78e09912a391a8c87414920bbdfee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 1 Apr 2021 15:26:02 +0200 Subject: [PATCH 37/76] torture: Correctly fetch number of CPUs for non-English languages Grepping for "CPU" on lscpu output isn't always successful, depending on the local language setting. As a result, the build can be aborted early with: "make: the '-j' option requires a positive integer argument" This commit therefore uses the human-language-independent approach available via the getconf command, both in kvm-build.sh and in kvm-remote.sh. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-build.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index 55f4fc102624..5ad973dca820 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -42,7 +42,7 @@ then fi # Tell "make" to use double the number of real CPUs on the build system. -ncpus="`lscpu | grep '^CPU(' | awk '{ print $2 }'`" +ncpus="`getconf _NPROCESSORS_ONLN`" make -j$((2 * ncpus)) $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1 retval=$? if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | egrep -q "Stop|Error|error:|warning:" || egrep -q "Stop|Error|error:" < $resdir/Make.out diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh index c4859fca87b2..f08d415d4f99 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -136,7 +136,7 @@ chmod +x $T/bin/kvm-remote-*.sh # Check first to avoid the need for cleanup for system-name typos for i in $systems do - ncpus="`ssh $i lscpu | grep '^CPU(' | awk '{ print $2 }'`" + ncpus="`ssh $i getconf _NPROCESSORS_ONLN 2> /dev/null`" echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log" ret=$? if test "$ret" -ne 0 From 00ad25f6019b3bd61bd2ddc128509728b49ac589 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Apr 2021 15:26:56 -0700 Subject: [PATCH 38/76] torture: Set kvm.sh language to English Some of the code invoked directly and indirectly from kvm.sh parses the output of commands. This parsing assumes English, which can cause failures if the user has set some other language. In a few cases, there are language-independent commands available, but this is not always the case. Therefore, as an alternative to polyglot parsing, this commit sets the LANG environment variable to en_US.UTF-8. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index fab3bd9cf9d1..390bb97b07d8 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -20,6 +20,9 @@ mkdir $T cd `dirname $scriptname`/../../../../../ +# This script knows only English. +LANG=en_US.UTF-8; export LANG + dur=$((30*60)) dryrun="" KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM From 7b9dad7abad70750c7fbacd5eb5e917f73b42759 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Apr 2021 17:09:37 -0700 Subject: [PATCH 39/76] rcutorture: Delay-based false positives for RCU priority boosting tests If an rcu_torture_boost() kthread determines that its grace period has not yet ended, it invokes rcu_torture_boost_failed() which checks whether enough time has elapsed for this to be considered a failure of RCU priority boosting, and, if so, flags the error. Unfortunately, that kthread might be preempted for some seconds between the time that it checks the grace period and the time that it checks the time. This delay can result in a false positive, featuring a complaint that a particular grace period has not ended, followed by a diagnostic dump featuring a much later grace period. This commit avoids these false positives by rechecking for the end of the grace period after the time check. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 06d08f4f3e52..3defd0febe15 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -921,6 +921,10 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start static int dbg_done; if (end - start > test_boost_duration * HZ - HZ / 2) { + // Recheck after checking time to avoid false positives. + smp_mb(); // Time check before grace-period check. + if (cur_ops->poll_gp_state(gp_state)) + return false; // passed, though perhaps just barely VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) { @@ -929,10 +933,10 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start cur_ops->gp_kthread_dbg(); } - return true; /* failed */ + return true; // failed } - return false; /* passed */ + return false; // passed } static int rcu_torture_boost(void *arg) From 8c7ec02e2a69807db8024635b48829dca5701c42 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Apr 2021 20:00:00 -0700 Subject: [PATCH 40/76] rcutorture: Consolidate rcu_torture_boost() timing and statistics This commit consolidates two loops in rcu_torture_boost(), one of which counts the number of boost-test episodes and the other of which computes the start time of the next episode, into one loop that does both with but a single acquisition of boost_mutex. This means that the count of the number of boost-test episodes is incremented after an episode completes rather than before it starts, but it also avoids the over-counting that was possible previously. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3defd0febe15..31338b2d6609 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -956,15 +956,6 @@ static int rcu_torture_boost(void *arg) bool failed = false; // Test failed already in this test interval bool gp_initiated = false; - /* Increment n_rcu_torture_boosts once per boost-test */ - while (!kthread_should_stop()) { - if (mutex_trylock(&boost_mutex)) { - n_rcu_torture_boosts++; - mutex_unlock(&boost_mutex); - break; - } - schedule_timeout_uninterruptible(1); - } if (kthread_should_stop()) goto checkwait; @@ -1015,7 +1006,10 @@ static int rcu_torture_boost(void *arg) */ while (oldstarttime == boost_starttime && !kthread_should_stop()) { if (mutex_trylock(&boost_mutex)) { - boost_starttime = jiffies + test_boost_interval * HZ; + if (oldstarttime == boost_starttime) { + boost_starttime = jiffies + test_boost_interval * HZ; + n_rcu_torture_boosts++; + } mutex_unlock(&boost_mutex); break; } From bcd4af44e2f173074328980b60178fdbb1853e4f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Apr 2021 10:46:55 -0700 Subject: [PATCH 41/76] rcutorture: Make rcu_torture_boost_failed() check for GP end It is possible that a delayed grace period that rcu_torture_boost() was polling for ended while rcu_torture_boost_failed() was printing the failure splat. It would be good to know when this happens. This commit therefore has rcu_torture_boost_failed() recheck the grace period after printing the splat, and printing a message indicating whether or not the grace period has ended. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 31338b2d6609..02a14dfcae67 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -919,6 +919,7 @@ static void rcu_torture_enable_rt_throttle(void) static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start, unsigned long end) { static int dbg_done; + bool gp_done; if (end - start > test_boost_duration * HZ - HZ / 2) { // Recheck after checking time to avoid false positives. @@ -931,6 +932,11 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n", current->rt_priority, gp_state, end - start); cur_ops->gp_kthread_dbg(); + // Recheck after print to flag grace period ending during splat. + gp_done = cur_ops->poll_gp_state(gp_state); + pr_info("Boost inversion: GP %lu %s.\n", gp_state, + gp_done ? "ended already" : "still pending"); + } return true; // failed From d4240d628f989efe32b3ad10a78d6921f8e28bd6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 11 Apr 2021 10:44:12 -0700 Subject: [PATCH 42/76] rcutorture: Add BUSTED-BOOST to test RCU priority boosting tests This commit adds the BUSTED-BOOST rcutorture scenario, which can be used to test rcutorture's ability to test RCU priority boosting. Signed-off-by: Paul E. McKenney --- .../rcutorture/configs/rcu/BUSTED-BOOST | 17 +++++++++++++++++ .../rcutorture/configs/rcu/BUSTED-BOOST.boot | 8 ++++++++ 2 files changed, 25 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST new file mode 100644 index 000000000000..22d598f9cabe --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST @@ -0,0 +1,17 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=16 +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=y +CONFIG_NO_HZ_IDLE=n +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_TRACE=y +CONFIG_HOTPLUG_CPU=y +CONFIG_RCU_FANOUT=2 +CONFIG_RCU_FANOUT_LEAF=2 +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot new file mode 100644 index 000000000000..f57720c52c0f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot @@ -0,0 +1,8 @@ +rcutorture.test_boost=2 +rcutorture.stutter=0 +rcutree.gp_preinit_delay=12 +rcutree.gp_init_delay=3 +rcutree.gp_cleanup_delay=3 +rcutree.kthread_prio=2 +threadirqs +tree.use_softirq=0 From 0260b92e1c39412b1e345e202355c43169c16274 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Apr 2021 13:01:14 -0700 Subject: [PATCH 43/76] rcutorture: Forgive RCU boost failures when CPUs don't pass through QS Currently, rcu_torture_boost() runs CPU-bound at real-time priority to force RCU priority inversions. It then checks that grace periods progress during this CPU-bound time. If grace periods fail to progress, it reports and RCU priority boosting failure. However, it is possible (and sometimes does happen) that the grace period fails to progress due to a CPU failing to pass through a quiescent state for an extended time period (3.5 seconds by default). This can happen due to vCPU preemption, long-running interrupts, and much else besides. There is nothing that RCU priority boosting can do about these situations, and so they should not be counted as RCU priority boosting failures. This commit therefore checks for CPUs (as opposed to preempted tasks) holding up a grace period, and flags the resulting RCU priority boosting failures, but does not splat nor count them as errors. It does rate-limit them to avoid flooding the console log. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 67 +++++++++++++++++++++++++---------------- kernel/rcu/tree_stall.h | 36 ++++++++++++++++++++++ 3 files changed, 79 insertions(+), 26 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bf0827d4b659..daf0cd3f2926 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -519,6 +519,7 @@ static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } +static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } @@ -527,6 +528,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); +bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 02a14dfcae67..5ae4dcc6ba27 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -340,6 +340,7 @@ struct rcu_torture_ops { void (*fqs)(void); void (*stats)(void); void (*gp_kthread_dbg)(void); + bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); int irq_capable; int can_boost; @@ -483,31 +484,32 @@ static void rcu_sync_torture_init(void) } static struct rcu_torture_ops rcu_ops = { - .ttype = RCU_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .readlock_held = torture_readlock_not_held, - .get_gp_seq = rcu_get_gp_seq, - .gp_diff = rcu_seq_diff, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .exp_sync = synchronize_rcu_expedited, - .get_gp_state = get_state_synchronize_rcu, - .start_gp_poll = start_poll_synchronize_rcu, - .poll_gp_state = poll_state_synchronize_rcu, - .cond_sync = cond_synchronize_rcu, - .call = call_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .gp_kthread_dbg = show_rcu_gp_kthreads, - .stall_dur = rcu_jiffies_till_stall_check, - .irq_capable = 1, - .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), - .extendables = RCUTORTURE_MAX_EXTEND, - .name = "rcu" + .ttype = RCU_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .readlock_held = torture_readlock_not_held, + .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .get_gp_state = get_state_synchronize_rcu, + .start_gp_poll = start_poll_synchronize_rcu, + .poll_gp_state = poll_state_synchronize_rcu, + .cond_sync = cond_synchronize_rcu, + .call = call_rcu, + .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, + .stats = NULL, + .gp_kthread_dbg = show_rcu_gp_kthreads, + .check_boost_failed = rcu_check_boost_fail, + .stall_dur = rcu_jiffies_till_stall_check, + .irq_capable = 1, + .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), + .extendables = RCUTORTURE_MAX_EXTEND, + .name = "rcu" }; /* @@ -918,14 +920,27 @@ static void rcu_torture_enable_rt_throttle(void) static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start, unsigned long end) { + int cpu; static int dbg_done; bool gp_done; + unsigned long j; + static unsigned long last_persist; + unsigned long lp; + unsigned long mininterval = test_boost_duration * HZ - HZ / 2; - if (end - start > test_boost_duration * HZ - HZ / 2) { + if (end - start > mininterval) { // Recheck after checking time to avoid false positives. smp_mb(); // Time check before grace-period check. if (cur_ops->poll_gp_state(gp_state)) return false; // passed, though perhaps just barely + if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) { + // At most one persisted message per boost test. + j = jiffies; + lp = READ_ONCE(last_persist); + if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp) + pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + return false; // passed on a technicality + } VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) { diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 59b95cc5cbdf..af92d9fee0d4 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -716,6 +716,42 @@ static void check_cpu_stall(struct rcu_data *rdp) // RCU forward-progress mechanisms, including of callback invocation. +/* + * Check to see if a failure to end RCU priority inversion was due to + * a CPU not passing through a quiescent state. When this happens, there + * is nothing that RCU priority boosting can do to help, so we shouldn't + * count this as an RCU priority boosting failure. A return of true says + * RCU priority boosting is to blame, and false says otherwise. If false + * is returned, the first of the CPUs to blame is stored through cpup. + */ +bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rnp->qsmask) { + // No CPUs without quiescent states for this rnp. + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; + } + // Find the first holdout CPU. + for_each_leaf_node_possible_cpu(rnp, cpu) { + if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + *cpup = cpu; + return false; + } + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + // Can't blame CPUs, so must blame RCU priority boosting. + return true; +} +EXPORT_SYMBOL_GPL(rcu_check_boost_fail); + /* * Show the state of the grace-period kthreads. */ From 063f5a4df99145ba0a5d4879d171a8175235f37b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Apr 2021 13:00:10 -0700 Subject: [PATCH 44/76] rcutorture: Don't count CPU-stalled time against priority boosting It will frequently be the case that rcu_torture_boost() will get a ->start_gp_poll() cookie that needs almost all of the current grace period plus an additional grace period to elapse before ->poll_gp_state() will return true. It is quite possible that the current grace period will have (say) two seconds of stall by a CPU failing to pass through a quiescent state, followed by 300 milliseconds of delay due to a preempted reader. The next grace period might suffer only one second of stall by a CPU, followed by another 300 milliseconds of delay due to a preempted reader. This is an example of RCU priority boosting doing its job, but the full elapsed time of 3.6 seconds exceeds the 3.5-second limit. In addition, there is no CPU stall in force at the 3.5-second mark, so this would nevertheless currently be counted as an RCU priority boosting failure. This commit therefore avoids this sort of false positive by resetting the gp_state_time timestamp any time that the current grace period is being blocked by a CPU. This results in extremely frequent calls to the ->check_boost_failed() function, so this commit provides a lockless fastpath that is selected by supplying a NULL CPU-number pointer. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 ++++++++----- kernel/rcu/tree_stall.h | 10 ++++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5ae4dcc6ba27..8b347b9659aa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -918,17 +918,18 @@ static void rcu_torture_enable_rt_throttle(void) old_rt_runtime = -1; } -static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start, unsigned long end) +static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start) { int cpu; static int dbg_done; + unsigned long end = jiffies; bool gp_done; unsigned long j; static unsigned long last_persist; unsigned long lp; unsigned long mininterval = test_boost_duration * HZ - HZ / 2; - if (end - start > mininterval) { + if (end - *start > mininterval) { // Recheck after checking time to avoid false positives. smp_mb(); // Time check before grace-period check. if (cur_ops->poll_gp_state(gp_state)) @@ -945,7 +946,7 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start n_rcu_torture_boost_failure++; if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) { pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n", - current->rt_priority, gp_state, end - start); + current->rt_priority, gp_state, end - *start); cur_ops->gp_kthread_dbg(); // Recheck after print to flag grace period ending during splat. gp_done = cur_ops->poll_gp_state(gp_state); @@ -955,6 +956,8 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long start } return true; // failed + } else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) { + *start = jiffies; } return false; // passed @@ -995,7 +998,7 @@ static int rcu_torture_boost(void *arg) while (time_before(jiffies, endtime)) { // Has current GP gone too long? if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state)) - failed = rcu_torture_boost_failed(gp_state, gp_state_time, jiffies); + failed = rcu_torture_boost_failed(gp_state, &gp_state_time); // If we don't have a grace period in flight, start one. if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) { gp_state = cur_ops->start_gp_poll(); @@ -1016,7 +1019,7 @@ static int rcu_torture_boost(void *arg) // In case the grace period extended beyond the end of the loop. if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state)) - rcu_torture_boost_failed(gp_state, gp_state_time, jiffies); + rcu_torture_boost_failed(gp_state, &gp_state_time); /* * Set the start time of the next test interval. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index af92d9fee0d4..8bde1b53b0c9 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -723,6 +723,10 @@ static void check_cpu_stall(struct rcu_data *rdp) * count this as an RCU priority boosting failure. A return of true says * RCU priority boosting is to blame, and false says otherwise. If false * is returned, the first of the CPUs to blame is stored through cpup. + * + * If cpup is NULL, then a lockless quick check is carried out, suitable + * for high-rate usage. On the other hand, if cpup is non-NULL, each + * rcu_node structure's ->lock is acquired, ruling out high-rate usage. */ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { @@ -731,6 +735,12 @@ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { + if (!cpup) { + if (READ_ONCE(rnp->qsmask)) + return false; + else + continue; + } raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rnp->qsmask) { // No CPUs without quiescent states for this rnp. From c43d3b0083b4f2e9b14174a5857ab06cbca986df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Apr 2021 09:56:42 -0700 Subject: [PATCH 45/76] torture: Make kvm-remote.sh account for network failure in pathname checks In a long-duration kvm-remote.sh run, almost all of the remote accesses will be simple file-existence checks. These are thus the most likely to be caught out by network failures, which do happen from time to time. This commit therefore takes a first step towards tolerating temporary network outages by making the file-existence checks repeat in the face of such an outage. They also print a message every minute during a outage, allowing the user to take appropriate action. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-remote.sh | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh index f08d415d4f99..20e848d2c0bb 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -159,6 +159,28 @@ do fi done +# Function to check for presence of a file on the specified system. +# Complain if the system cannot be reached, and retry after a wait. +# Currently just waits forever if a machine disappears. +# +# Usage: checkremotefile system pathname +checkremotefile () { + local ret + local sleeptime=60 + + while : + do + ssh $1 "test -f \"$2\"" + ret=$? + if test "$ret" -ne 255 + then + return $ret + fi + echo " ---" ssh failure to $1 checking for file $2, retry after $sleeptime seconds. `date` + sleep $sleeptime + done +} + # Function to start batches on idle remote $systems # # Usage: startbatches curbatch nbatches @@ -178,7 +200,7 @@ startbatches () { echo $((nbatches + 1)) return 0 fi - if ssh "$i" "test -f \"$resdir/$ds/remote.run\"" 1>&2 + if checkremotefile "$i" "$resdir/$ds/remote.run" 1>&2 then continue # System still running last test, skip. fi @@ -216,7 +238,7 @@ echo All batches started. `date` # Wait for all remaining scenarios to complete and collect results. for i in $systems do - while ssh "$i" "test -f \"$resdir/$ds/remote.run\"" + while checkremotefile "$i" "$resdir/$ds/remote.run" do sleep 30 done From 3d78668e5b50f1a28fdfd4293fc61b90eb10ba75 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Apr 2021 13:51:35 -0700 Subject: [PATCH 46/76] torture: Don't cap remote runs by build-system number of CPUs Currently, if a torture scenario requires more CPUs than are present on the build system, kvm.sh and friends limit the CPUs available to that scenario. This makes total sense when the build system and the system running the scenarios are one and the same, but not so much when remote systems might well have more CPUs. This commit therefore introduces a --remote flag to kvm.sh that suppresses this CPU-limiting behavior, and causes kvm-remote.sh to use this flag. Signed-off-by: Paul E. McKenney --- .../testing/selftests/rcutorture/bin/kvm-remote.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm.sh | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh index 20e848d2c0bb..79e680e0e7bf 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -66,7 +66,7 @@ then if (ds != "") print "--datestamp " ds; }'`" - kvm.sh "$@" $datestamp --buildonly > $T/kvm.sh.out 2>&1 + kvm.sh --remote "$@" $datestamp --buildonly > $T/kvm.sh.out 2>&1 ret=$? if test "$ret" -ne 0 then diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 390bb97b07d8..b4ac4ee33222 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -44,6 +44,7 @@ TORTURE_KCONFIG_KASAN_ARG="" TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" TORTURE_QEMU_MEM=512 +TORTURE_REMOTE= TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu TORTURE_MOD=rcutorture @@ -80,6 +81,7 @@ usage () { echo " --no-initrd" echo " --qemu-args qemu-arguments" echo " --qemu-cmd qemu-system-..." + echo " --remote" echo " --results absolute-pathname" echo " --torture lock|rcu|rcuscale|refscale|scf" echo " --trust-make" @@ -115,10 +117,13 @@ do checkarg --cpus "(number)" "$#" "$2" '^[0-9]*$' '^--' cpus=$2 TORTURE_ALLOTED_CPUS="$2" - max_cpus="`identify_qemu_vcpus`" - if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus" + if test -z "$TORTURE_REMOTE" then - TORTURE_ALLOTED_CPUS=$max_cpus + max_cpus="`identify_qemu_vcpus`" + if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus" + then + TORTURE_ALLOTED_CPUS=$max_cpus + fi fi shift ;; @@ -209,6 +214,9 @@ do TORTURE_QEMU_CMD="$2" shift ;; + --remote) + TORTURE_REMOTE=1 + ;; --results) checkarg --results "(absolute pathname)" "$#" "$2" '^/' '^error' resdir=$2 From 7ab2bd31df871408792eac871c4187e29d039315 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 May 2021 19:56:05 -0700 Subject: [PATCH 47/76] rcutorture: Move mem_dump_obj() tests into separate function To make the purpose of the code more apparent, this commit moves the tests of mem_dump_obj() to a new rcu_torture_mem_dump_obj() function and calls it from rcu_torture_cleanup(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 81 +++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8b347b9659aa..ec69273898af 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1868,48 +1868,49 @@ rcu_torture_stats(void *arg) torture_shutdown_absorb("rcu_torture_stats"); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_stats"); - - { - struct rcu_head *rhp; - struct kmem_cache *kcp; - static int z; - - kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL); - rhp = kmem_cache_alloc(kcp, GFP_KERNEL); - pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z); - pr_alert("mem_dump_obj(ZERO_SIZE_PTR):"); - mem_dump_obj(ZERO_SIZE_PTR); - pr_alert("mem_dump_obj(NULL):"); - mem_dump_obj(NULL); - pr_alert("mem_dump_obj(%px):", &rhp); - mem_dump_obj(&rhp); - pr_alert("mem_dump_obj(%px):", rhp); - mem_dump_obj(rhp); - pr_alert("mem_dump_obj(%px):", &rhp->func); - mem_dump_obj(&rhp->func); - pr_alert("mem_dump_obj(%px):", &z); - mem_dump_obj(&z); - kmem_cache_free(kcp, rhp); - kmem_cache_destroy(kcp); - rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); - pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); - pr_alert("mem_dump_obj(kmalloc %px):", rhp); - mem_dump_obj(rhp); - pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func); - mem_dump_obj(&rhp->func); - kfree(rhp); - rhp = vmalloc(4096); - pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); - pr_alert("mem_dump_obj(vmalloc %px):", rhp); - mem_dump_obj(rhp); - pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func); - mem_dump_obj(&rhp->func); - vfree(rhp); - } - return 0; } +/* Test mem_dump_obj() and friends. */ +static void rcu_torture_mem_dump_obj(void) +{ + struct rcu_head *rhp; + struct kmem_cache *kcp; + static int z; + + kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL); + rhp = kmem_cache_alloc(kcp, GFP_KERNEL); + pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z); + pr_alert("mem_dump_obj(ZERO_SIZE_PTR):"); + mem_dump_obj(ZERO_SIZE_PTR); + pr_alert("mem_dump_obj(NULL):"); + mem_dump_obj(NULL); + pr_alert("mem_dump_obj(%px):", &rhp); + mem_dump_obj(&rhp); + pr_alert("mem_dump_obj(%px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(%px):", &rhp->func); + mem_dump_obj(&rhp->func); + pr_alert("mem_dump_obj(%px):", &z); + mem_dump_obj(&z); + kmem_cache_free(kcp, rhp); + kmem_cache_destroy(kcp); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); + pr_alert("mem_dump_obj(kmalloc %px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func); + mem_dump_obj(&rhp->func); + kfree(rhp); + rhp = vmalloc(4096); + pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); + pr_alert("mem_dump_obj(vmalloc %px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func); + mem_dump_obj(&rhp->func); + vfree(rhp); +} + static void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { @@ -2825,6 +2826,8 @@ rcu_torture_cleanup(void) if (cur_ops->cleanup != NULL) cur_ops->cleanup(); + rcu_torture_mem_dump_obj(); + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ if (err_segs_recorded) { From ce7c169dee28866539abb0e603b9a23055d30fdc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Mar 2021 13:23:49 -0700 Subject: [PATCH 48/76] rcu: Remove the unused rcu_irq_exit_preempt() function Commit 9ee01e0f69a9 ("x86/entry: Clean up idtentry_enter/exit() leftovers") left the rcu_irq_exit_preempt() in place in order to avoid conflicts with the -rcu tree. Now that this change has long since hit mainline, this commit removes the no-longer-used rcu_irq_exit_preempt() function. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 1 - include/linux/rcutree.h | 1 - kernel/rcu/tree.c | 22 ---------------------- 3 files changed, 24 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 35e0be326ffc..953e70fafe38 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -86,7 +86,6 @@ static inline void rcu_irq_enter(void) { } static inline void rcu_irq_exit_irqson(void) { } static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } -static inline void rcu_irq_exit_preempt(void) { } static inline void rcu_irq_exit_check_preempt(void) { } #define rcu_is_idle_cpu(cpu) \ (is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq()) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index b89b54130f49..53209d669400 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -49,7 +49,6 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); -void rcu_irq_exit_preempt(void); void rcu_irq_enter_irqson(void); void rcu_irq_exit_irqson(void); bool rcu_is_idle_cpu(int cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e78b2430c16..f6543b8004c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -833,28 +833,6 @@ void noinstr rcu_irq_exit(void) rcu_nmi_exit(); } -/** - * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq - * towards in kernel preemption - * - * Same as rcu_irq_exit() but has a sanity check that scheduling is safe - * from RCU point of view. Invoked from return from interrupt before kernel - * preemption. - */ -void rcu_irq_exit_preempt(void) -{ - lockdep_assert_irqs_disabled(); - rcu_nmi_exit(); - - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, - "RCU dynticks_nesting counter underflow/zero!"); - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != - DYNTICK_IRQ_NONIDLE, - "Bad RCU dynticks_nmi_nesting counter\n"); - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), - "RCU in extended quiescent state!"); -} - #ifdef CONFIG_PROVE_RCU /** * rcu_irq_exit_check_preempt - Validate that scheduling is possible From 277ffe1b709280856391663c2ca5685a28308fc5 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Tue, 30 Mar 2021 13:47:42 -0700 Subject: [PATCH 49/76] rcu: Improve tree.c comments and add code cleanups This commit cleans up some comments and code in kernel/rcu/tree.c. Signed-off-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f6543b8004c0..06f3de96997c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -202,7 +202,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); * the need for long delays to increase some race probabilities with the * need for fast grace periods to increase other race probabilities. */ -#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ +#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */ /* * Compute the mask of online CPUs for the specified rcu_node structure. @@ -937,7 +937,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void noinstr rcu_user_exit(void) { - rcu_eqs_exit(1); + rcu_eqs_exit(true); } /** @@ -1203,7 +1203,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ /* - * We are reporting a quiescent state on behalf of some other CPU, so + * When trying to report a quiescent state on behalf of some other CPU, * it is our responsibility to check for and handle potential overflow * of the rcu_node ->gp_seq counter with respect to the rcu_data counters. * After all, the CPU might be in deep idle state, and thus executing no @@ -2607,7 +2607,7 @@ static void rcu_do_batch(struct rcu_data *rdp) * state, for example, user mode or idle loop. It also schedules RCU * core processing. If the current grace period has gone on too long, * it will ask the scheduler to manufacture a context switch for the sole - * purpose of providing a providing the needed quiescent state. + * purpose of providing the needed quiescent state. */ void rcu_sched_clock_irq(int user) { @@ -3236,7 +3236,7 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp, /* * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bhead_free or ->head_free. + * It frees all the objects queued on ->bkvhead_free or ->head_free. */ static void kfree_rcu_work(struct work_struct *work) { @@ -3263,7 +3263,7 @@ static void kfree_rcu_work(struct work_struct *work) krwp->head_free = NULL; raw_spin_unlock_irqrestore(&krcp->lock, flags); - // Handle two first channels. + // Handle the first two channels. for (i = 0; i < FREE_N_CHANNELS; i++) { for (; bkvhead[i]; bkvhead[i] = bnext) { bnext = bkvhead[i]->next; @@ -3530,11 +3530,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, } /* - * Queue a request for lazy invocation of appropriate free routine after a - * grace period. Please note there are three paths are maintained, two are the - * main ones that use array of pointers interface and third one is emergency - * one, that is used only when the main path can not be maintained temporary, - * due to memory pressure. + * Queue a request for lazy invocation of the appropriate free routine + * after a grace period. Please note that three paths are maintained, + * two for the common case using arrays of pointers and a third one that + * is used only when the main paths cannot be used, for example, due to + * memory pressure. * * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will @@ -4708,7 +4708,7 @@ void __init rcu_init(void) rcutree_online_cpu(cpu); } - /* Create workqueue for expedited GPs and for Tree SRCU. */ + /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); From 8e4b1d2bc198e34b48fc7cc3a3c5a2fcb269e271 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 31 Mar 2021 10:59:05 -0700 Subject: [PATCH 50/76] rcu: Invoke rcu_spawn_core_kthreads() from rcu_spawn_gp_kthread() Currently, rcu_spawn_core_kthreads() is invoked via an early_initcall(), which works, except that rcu_spawn_gp_kthread() is also invoked via an early_initcall() and rcu_spawn_core_kthreads() relies on adjustments to kthread_prio that are carried out by rcu_spawn_gp_kthread(). There is no guaranttee of ordering among early_initcall() handlers, and thus no guarantee that kthread_prio will be properly checked and range-limited at the time that rcu_spawn_core_kthreads() needs it. In most cases, this bug is harmless. After all, the only reason that rcu_spawn_gp_kthread() adjusts the value of kthread_prio is if the user specified a nonsensical value for this boot parameter, which experience indicates is rare. Nevertheless, a bug is a bug. This commit therefore causes the rcu_spawn_core_kthreads() function to be invoked directly from rcu_spawn_gp_kthread() after any needed adjustments to kthread_prio have been carried out. Fixes: 48d07c04b4cc ("rcu: Enable elimination of Tree-RCU softirq processing") Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06f3de96997c..2532e584e95f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2889,7 +2889,6 @@ static int __init rcu_spawn_core_kthreads(void) "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); return 0; } -early_initcall(rcu_spawn_core_kthreads); /* * Handle any core-RCU processing required by a call_rcu() invocation. @@ -4450,6 +4449,7 @@ static int __init rcu_spawn_gp_kthread(void) wake_up_process(t); rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); + rcu_spawn_core_kthreads(); return 0; } early_initcall(rcu_spawn_gp_kthread); From e44111ed20d8b2d7b05b20d694358ae77d4e93e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Apr 2021 21:51:50 -0700 Subject: [PATCH 51/76] rcu: Add ->rt_priority and ->gp_start to show_rcu_gp_kthreads() output This commit adds ->rt_priority and ->gp_start to show_rcu_gp_kthreads() output in order to better diagnose RCU priority boosting failures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 59b95cc5cbdf..fb4702570316 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -726,6 +726,7 @@ void show_rcu_gp_kthreads(void) unsigned long j; unsigned long ja; unsigned long jr; + unsigned long js; unsigned long jw; struct rcu_data *rdp; struct rcu_node *rnp; @@ -734,11 +735,12 @@ void show_rcu_gp_kthreads(void) j = jiffies; ja = j - data_race(rcu_state.gp_activity); jr = j - data_race(rcu_state.gp_req_activity); + js = j - data_race(rcu_state.gp_start); jw = j - data_race(rcu_state.gp_wake_time); - pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", + pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, t ? t->state : 0x1ffffL, - ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), + rcu_state.gp_state, t ? t->state : 0x1ffffL, t ? t->rt_priority : 0xffU, + js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), (long)data_race(rcu_state.gp_seq), (long)data_race(rcu_get_root()->gp_seq_needed), data_race(rcu_state.gp_flags)); From 27ba76e164fc83ffe6ceeb0415c427ad1191af6c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Apr 2021 17:23:36 -0700 Subject: [PATCH 52/76] rcu: Add ->gp_max to show_rcu_gp_kthreads() output This commit adds ->gp_max to show_rcu_gp_kthreads() output in order to better diagnose RCU priority boosting failures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index fb4702570316..a4e2bb3bdce7 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -737,12 +737,13 @@ void show_rcu_gp_kthreads(void) jr = j - data_race(rcu_state.gp_req_activity); js = j - data_race(rcu_state.gp_start); jw = j - data_race(rcu_state.gp_wake_time); - pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", + pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, t ? t->state : 0x1ffffL, t ? t->rt_priority : 0xffU, js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), (long)data_race(rcu_state.gp_seq), (long)data_race(rcu_get_root()->gp_seq_needed), + data_race(rcu_state.gp_max), data_race(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), From 1feb2cc8db481b902272559ad7aae3c091762ad0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Apr 2021 09:47:59 -0700 Subject: [PATCH 53/76] lockdep: Explicitly flag likely false-positive report The reason that lockdep_rcu_suspicious() prints the value of debug_locks is because a value of zero indicates a likely false positive. This can work, but is a bit obtuse. This commit therefore explicitly calls out the possibility of a false positive. Reviewed-by: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/locking/lockdep.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 48d736aa03b2..d6c3c987009d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6393,6 +6393,7 @@ asmlinkage __visible void lockdep_sys_exit(void) void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; + int dl = READ_ONCE(debug_locks); /* Note: the following can be executed concurrently, so be careful. */ pr_warn("\n"); @@ -6402,11 +6403,12 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) pr_warn("-----------------------------\n"); pr_warn("%s:%d %s!\n", file, line, s); pr_warn("\nother info that might help us debug this:\n\n"); - pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" : "", - rcu_scheduler_active, debug_locks); + rcu_scheduler_active, dl, + dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n"); /* * If a CPU is in the RCU-free window in idle (ie: in the section From 3066820034b5dd4e89bd74a7739c51c2d6f5e554 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Apr 2021 09:51:05 -0700 Subject: [PATCH 54/76] rcu: Reject RCU_LOCKDEP_WARN() false positives If another lockdep report runs concurrently with an RCU lockdep report from RCU_LOCKDEP_WARN(), the following sequence of events can occur: 1. debug_lockdep_rcu_enabled() sees that lockdep is enabled when called from (say) synchronize_rcu(). 2. Lockdep is disabled by a concurrent lockdep report. 3. debug_lockdep_rcu_enabled() evaluates its lockdep-expression argument, for example, lock_is_held(&rcu_bh_lock_map). 4. Because lockdep is now disabled, lock_is_held() plays it safe and returns the constant 1. 5. But in this case, the constant 1 is not safe, because invoking synchronize_rcu() under rcu_read_lock_bh() is disallowed. 6. debug_lockdep_rcu_enabled() wrongly invokes lockdep_rcu_suspicious(), resulting in a false-positive splat. This commit therefore changes RCU_LOCKDEP_WARN() to check debug_lockdep_rcu_enabled() after checking the lockdep expression, so that any "safe" returns from lock_is_held() are rejected by debug_lockdep_rcu_enabled(). This requires memory ordering, which is supplied by READ_ONCE(debug_locks). The resulting volatile accesses prevent the compiler from reordering and the fact that only one variable is being accessed prevents the underlying hardware from reordering. The combination works for IA64, which can reorder reads to the same location, but this is defeated by the volatile accesses, which compile to load instructions that provide ordering. Reported-by: syzbot+dde0cc33951735441301@syzkaller.appspotmail.com Reported-by: Matthew Wilcox Reported-by: syzbot+88e4f02896967fe1ab0d@syzkaller.appspotmail.com Reported-by: Thomas Gleixner Suggested-by: Boqun Feng Reviewed-by: Boqun Feng Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- kernel/rcu/update.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9455476c5ba2..1199ffd305d1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -315,7 +315,7 @@ static inline int rcu_read_lock_any_held(void) #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data.unlikely") __warned; \ - if (debug_lockdep_rcu_enabled() && !__warned && (c)) { \ + if ((c) && debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index b95ae86c40a7..dd94a602a6d2 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -277,7 +277,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map); noinstr int notrace debug_lockdep_rcu_enabled(void) { - return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks && + return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) && current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); From 396eba65f62414ee8850ed5f7b5ce844719ebebf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Apr 2021 16:31:42 -0700 Subject: [PATCH 55/76] rcu: Add quiescent states and boost states to show_rcu_gp_kthreads() output This commit adds each rcu_node structure's ->qsmask and "bBEG" output indicating whether: (1) There is a boost kthread, (2) A reader needs to be (or is in the process of being) boosted, (3) A reader is blocking an expedited grace period, and (4) A reader is blocking a normal grace period. This helps diagnose RCU priority boosting failures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 1 + kernel/rcu/tree_stall.h | 12 +++++++++--- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 71821d59d95c..5fd0c443517e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -115,6 +115,7 @@ struct rcu_node { /* boosting for this rcu_node structure. */ unsigned int boost_kthread_status; /* State of boost_kthread_task for tracing. */ + unsigned long n_boosts; /* Number of boosts for this rcu_node structure. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2cbe8f8456e6..ef004cc7101d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1098,6 +1098,7 @@ static int rcu_boost(struct rcu_node *rnp) /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ + rnp->n_boosts++; return READ_ONCE(rnp->exp_tasks) != NULL || READ_ONCE(rnp->boost_tasks) != NULL; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index a4e2bb3bdce7..c1f83864a18e 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -749,9 +749,15 @@ void show_rcu_gp_kthreads(void) if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed))) continue; - pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq), - (long)data_race(rnp->gp_seq_needed)); + pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n", + rnp->grplo, rnp->grphi, + (long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed), + data_race(rnp->qsmask), + ".b"[!!data_race(rnp->boost_kthread_task)], + ".B"[!!data_race(rnp->boost_tasks)], + ".E"[!!data_race(rnp->exp_tasks)], + ".G"[!!data_race(rnp->gp_tasks)], + data_race(rnp->n_boosts)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { From 3ef5a1c3821ab61da3e9fe0f4561be903ae2bc84 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Apr 2021 20:42:09 -0700 Subject: [PATCH 56/76] rcu: Make RCU priority boosting work on single-CPU rcu_node structures When any CPU comes online, it checks to see if an RCU-boost kthread has already been created for that CPU's leaf rcu_node structure, and if not, it creates one. Unfortunately, it also verifies that this leaf rcu_node structure actually has at least one online CPU, and if not, it declines to create the kthread. Although this behavior makes sense during early boot, especially on systems that claim far more CPUs than they actually have, it makes no sense for the first CPU to come online for a given rcu_node structure. There is no point in checking because we know there is a CPU on its way in. The problem is that timing differences can cause this incoming CPU to not yet be reflected in the various bit masks even at rcutree_online_cpu() time, and there is no chance at rcutree_prepare_cpu() time. Plus it would be better to create the RCU-boost kthread at rcutree_prepare_cpu() to handle the case where the CPU is involved in an RCU priority inversion very shortly after it comes online. This commit therefore moves the checking to rcu_prepare_kthreads(), which is called only at early boot, when the check is appropriate. In addition, it makes rcutree_prepare_cpu() invoke rcu_spawn_one_boost_kthread(), which no longer does any checking for online CPUs. With this change, RCU priority boosting tests now pass for short rcutorture runs, even with single-CPU leaf rcu_node structures. Cc: Sebastian Andrzej Siewior Cc: Scott Wood Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 31 ++++++++----------------------- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2532e584e95f..00a3ebca70b8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4166,7 +4166,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcu_prepare_kthreads(cpu); + rcu_spawn_one_boost_kthread(rnp); rcu_spawn_cpu_nocb_kthread(cpu); WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5fd0c443517e..b5508f44ff29 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -418,8 +418,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(void); static void rcu_cpu_kthread_setup(unsigned int cpu); +static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); static void __init rcu_spawn_boost_kthreads(void); -static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ef004cc7101d..3c90dad00d3c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1198,22 +1198,16 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) */ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { - int rnp_index = rnp - rcu_get_root(); unsigned long flags; + int rnp_index = rnp - rcu_get_root(); struct sched_param sp; struct task_struct *t; - if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) - return; - - if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) + if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) return; rcu_state.boost = 1; - if (rnp->boost_kthread_task != NULL) - return; - t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); if (WARN_ON_ONCE(IS_ERR(t))) @@ -1265,17 +1259,8 @@ static void __init rcu_spawn_boost_kthreads(void) struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) - rcu_spawn_one_boost_kthread(rnp); -} - -static void rcu_prepare_kthreads(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; - - /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ - if (rcu_scheduler_fully_active) - rcu_spawn_one_boost_kthread(rnp); + if (rcu_rnp_online_cpus(rnp)) + rcu_spawn_one_boost_kthread(rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1295,6 +1280,10 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } +static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) +{ +} + static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } @@ -1303,10 +1292,6 @@ static void __init rcu_spawn_boost_kthreads(void) { } -static void rcu_prepare_kthreads(int cpu) -{ -} - #endif /* #else #ifdef CONFIG_RCU_BOOST */ #if !defined(CONFIG_RCU_FAST_NO_HZ) From b15805013b441b13fcf6e402c03421c03edb79c6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Apr 2021 15:14:01 -0700 Subject: [PATCH 57/76] rcu: Make show_rcu_gp_kthreads() dump rcu_node structures blocking GP Currently, show_rcu_gp_kthreads() only dumps rcu_node structures that have outdated ideas of the current grace-period number. This commit also dumps those that are in any way blocking the current grace period. This helps diagnose RCU priority boosting failures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c1f83864a18e..e6bd518e0bc4 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -746,8 +746,9 @@ void show_rcu_gp_kthreads(void) data_race(rcu_state.gp_max), data_race(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { - if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), - READ_ONCE(rnp->gp_seq_needed))) + if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) && + !data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) && + !data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks)) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n", rnp->grplo, rnp->grphi, From 4d80b8e196fad9852050f3c8624eea09a6bbeada Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Apr 2021 15:21:32 -0700 Subject: [PATCH 58/76] rcu: Restrict RCU_STRICT_GRACE_PERIOD to at most four CPUs Kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y can experience significant lock contention due to RCU's resulting focus on ending grace periods as soon as possible. This is OK, but only if there are not very many CPUs. This commit therefore puts this Kconfig option off-limits to systems with more than four CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 1942c1f1bb65..4fd64999300f 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -116,7 +116,7 @@ config RCU_EQS_DEBUG config RCU_STRICT_GRACE_PERIOD bool "Provide debug RCU implementation with short grace periods" - depends on DEBUG_KERNEL && RCU_EXPERT + depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 default n select PREEMPT_COUNT if PREEMPT=n help From 2f20de99a63b0de9bcceedafc3281e65fbf7d4fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 11 Apr 2021 10:49:52 -0700 Subject: [PATCH 59/76] rcu: Make rcu_gp_cleanup() be noinline for tracing Although there are trace events for RCU grace periods, these are only enabled in CONFIG_RCU_TRACE=y kernels. This commit therefore marks rcu_gp_cleanup() noinline in order to provide a function that can be traced that is invoked near the end of each grace period. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 00a3ebca70b8..6eb64e44bdcd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2026,7 +2026,7 @@ static void rcu_gp_fqs_loop(void) /* * Clean up after the old grace period. */ -static void rcu_gp_cleanup(void) +static noinline void rcu_gp_cleanup(void) { int cpu; bool needgp = false; From 3d3a0d1b508dcc47e82b0e12cde6585bc088b0cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 16 Apr 2021 16:53:16 -0700 Subject: [PATCH 60/76] rcu: Point to documentation of ordering guarantees Add comments to synchronize_rcu() and friends that point to Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 +++ kernel/rcu/tree.c | 20 ++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e26547b34ad3..f8340c3b1c00 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1000,6 +1000,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are * passed the same srcu_struct structure. * + * Implementation of these memory-ordering guarantees is similar to + * that of synchronize_rcu(). + * * If SRCU is likely idle, expedite the first request. This semantic * was provided by Classic SRCU, and is relied upon by its users, so TREE * SRCU must also provide it. Note that detecting idleness is heuristic diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6eb64e44bdcd..2437960a2795 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3084,6 +3084,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) * between the call to call_rcu() and the invocation of "func()" -- even * if CPU A and CPU B are the same CPU (but again only if the system has * more than one CPU). + * + * Implementation of these memory-ordering guarantees is described here: + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3751,6 +3754,9 @@ static int rcu_blocking_is_gp(void) * to have executed a full memory barrier during the execution of * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). + * + * Implementation of these memory-ordering guarantees is described here: + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. */ void synchronize_rcu(void) { @@ -3821,7 +3827,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); /** * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period * - * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu() + * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() * * If a full RCU grace period has elapsed since the earlier call from * which oldstate was obtained, return @true, otherwise return @false. @@ -3837,6 +3843,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * (many hours even on 32-bit systems) should check them occasionally * and either refresh them or set a flag indicating that the grace period * has completed. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate, and that returned at the end + * of this function. */ bool poll_state_synchronize_rcu(unsigned long oldstate) { @@ -3851,7 +3862,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); /** * cond_synchronize_rcu - Conditionally wait for an RCU grace period * - * @oldstate: return value from earlier call to get_state_synchronize_rcu() + * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() * * If a full RCU grace period has elapsed since the earlier call to * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. @@ -3861,6 +3872,11 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); * counter wrap is harmless. If the counter wraps, we have waited for * more than 2 billion grace periods (and way more on a 64-bit system!), * so waiting for one additional grace period should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate, and that returned at the end + * of this function. */ void cond_synchronize_rcu(unsigned long oldstate) { From 5390473ec1697b71af0e9d63ef7aaa7ecd27e2c9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Apr 2021 16:30:34 -0700 Subject: [PATCH 61/76] rcu: Don't penalize priority boosting when there is nothing to boost RCU priority boosting cannot do anything unless there is at least one task blocking the current RCU grace period that was preempted within the RCU read-side critical section that it still resides in. However, the current rcu_torture_boost_failed() code will count this as an RCU priority-boosting failure if there were no CPUs blocking the current grace period. This situation can happen (for example) if the last CPU blocking the current grace period was subjected to vCPU preemption, which is always a risk for rcutorture guest OSes. This commit therefore causes rcu_torture_boost_failed() to refrain from reporting failure unless there is at least one task blocking the current RCU grace period that was preempted within the RCU read-side critical section that it still resides in. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 8bde1b53b0c9..65302518e006 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -723,6 +723,10 @@ static void check_cpu_stall(struct rcu_data *rdp) * count this as an RCU priority boosting failure. A return of true says * RCU priority boosting is to blame, and false says otherwise. If false * is returned, the first of the CPUs to blame is stored through cpup. + * If there was no CPU blocking the current grace period, but also nothing + * in need of being boosted, *cpup is set to -1. This can happen in case + * of vCPU preemption while the last CPU is reporting its quiscent state, + * for example. * * If cpup is NULL, then a lockless quick check is carried out, suitable * for high-rate usage. On the other hand, if cpup is non-NULL, each @@ -730,18 +734,25 @@ static void check_cpu_stall(struct rcu_data *rdp) */ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { + bool atb = false; int cpu; unsigned long flags; struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { if (!cpup) { - if (READ_ONCE(rnp->qsmask)) + if (READ_ONCE(rnp->qsmask)) { return false; - else + } else { + if (READ_ONCE(rnp->gp_tasks)) + atb = true; continue; + } } + *cpup = -1; raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->gp_tasks) + atb = true; if (!rnp->qsmask) { // No CPUs without quiescent states for this rnp. raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -758,7 +769,7 @@ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } // Can't blame CPUs, so must blame RCU priority boosting. - return true; + return atb; } EXPORT_SYMBOL_GPL(rcu_check_boost_fail); From c7ef7500a891432a3bb2b036f535bfdf58aa3605 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:05 +0100 Subject: [PATCH 62/76] rcu/nocb: Directly call __wake_nocb_gp() from bypass timer The bypass timer calls __call_rcu_nocb_wake() instead of directly calling __wake_nocb_gp(). The only difference here is that rdp->qlen_last_fqs_check gets overridden. But resetting the deferred force quiescent state base shouldn't be relevant for that timer. In fact the bypass queue in question can be for any rdp from the group and not necessarily the rdp leader on which the bypass timer is attached. This commit therefore calls __wake_nocb_gp() directly. This way we don't even need to lock the ->nocb_lock. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5a2aa9c4e569..82e9ffb92347 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2015,9 +2015,10 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t) struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); - rcu_nocb_lock_irqsave(rdp, flags); + + raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); smp_mb__after_spinlock(); /* Timer expire before wakeup. */ - __call_rcu_nocb_wake(rdp, true, flags); + __wake_nocb_gp(rdp, rdp, false, flags); } /* From 552cac80e65f38a0cc9022456c09efed7e88f9d6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:06 +0100 Subject: [PATCH 63/76] rcu/nocb: Allow de-offloading rdp leader The only thing that prevented an rdp leader from being de-offloaded was the nocb_bypass_timer that used to lock the nocb_lock of the rdp leader. If an rdp gets de-offloaded, it will subtlely ignore rcu_nocb_lock() calls and do its job in the timer unsafely. Worse yet: If it gets re-offloaded in the middle of the timer, rcu_nocb_unlock() would try to unlock, leaving it imbalanced. Now that the nocb_bypass_timer doesn't use the nocb_lock anymore, de-offloading the rdp leader is now safe. This commit therefore allows the rdp leader to be de-offloaded. Reported-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 82e9ffb92347..015adec5bdc5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2489,10 +2489,6 @@ int rcu_nocb_cpu_deoffload(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int ret = 0; - if (rdp == rdp->nocb_gp_rdp) { - pr_info("Can't deoffload an rdp GP leader (yet)\n"); - return -EINVAL; - } mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); if (rcu_rdp_is_offloaded(rdp)) { From b6e2c4ed35c33d7e55197aa26d99645a690ca467 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:07 +0100 Subject: [PATCH 64/76] rcu/nocb: Cancel nocb_timer upon nocb_gp wakeup When waking up in nocb_gp_wait(), there is no need to keep the nocb_timer around because this function will traverse the whole rdp list. Any update performed before the timer was armed will now be visible after the ->nocb_gp_lock acquire. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 015adec5bdc5..a667551a5501 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2211,6 +2211,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); if (bypass) del_timer(&my_rdp->nocb_bypass_timer); + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + } WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); } From 3b2348e2fdf403b25a317b394db605257f321966 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:08 +0100 Subject: [PATCH 65/76] rcu/nocb: Delete bypass_timer upon nocb_gp wakeup A NOCB-gp wake p can safely delete the ->nocb_bypass_timer because nocb_gp_wait() will recheck again the bypass state and rearm the bypass timer if necessary. This commit therefore deletes this timer. Reviewed-by: Boqun Feng Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a667551a5501..4253a0ecc14f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1701,6 +1701,8 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, del_timer(&rdp_gp->nocb_timer); } + del_timer(&rdp_gp->nocb_bypass_timer); + if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); needwake = true; From f9fc166b790bd214083035d865653133b8a963d1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:09 +0100 Subject: [PATCH 66/76] rcu/nocb: Only cancel nocb timer if not polling This commit refrains deleting the ->nocb_timer if rcu_nocb is polling because it should not ever have been queued in the polling case. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4253a0ecc14f..db28e3123f32 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2176,18 +2176,18 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) my_rdp->nocb_gp_gp = needwait_gp; my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; if (bypass) { - raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); - // Avoid race with first bypass CB. - if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); - } if (!rcu_nocb_poll) { + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + // Avoid race with first bypass CB. + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + } // At least one child with non-empty ->nocb_bypass, so set // timer in order to avoid stranding its callbacks. mod_timer(&my_rdp->nocb_bypass_timer, j + 2); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); } - raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); } if (rcu_nocb_poll) { /* Polling, so trace if first poll in the series. */ From 870905169da8bfae0570df013efe860d33251b0f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:10 +0100 Subject: [PATCH 67/76] rcu/nocb: Prepare for fine-grained deferred wakeup Tuning the deferred wakeup level must be done from a safe wakeup point. Currently those sites are: * ->nocb_timer * user/idle/guest entry * CPU down * softirq/rcuc All of these sites perform the wake up for both RCU_NOCB_WAKE and RCU_NOCB_WAKE_FORCE. In order to merge ->nocb_timer and ->nocb_bypass_timer together, we plan to add a new RCU_NOCB_WAKE_BYPASS that really should be deferred until a timer fires so that we don't wake up the NOCB-gp kthread too early. To prepare for that, this commit specifies the per-callsite wakeup level/limit. Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Frederic Weisbecker [ paulmck: Fix non-NOCB rcu_nocb_need_deferred_wakeup() definition. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 17 +++++++++-------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e78b2430c16..5f1545aab9ed 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3911,7 +3911,7 @@ static int rcu_pending(int user) check_cpu_stall(rdp); /* Does this CPU need a deferred NOCB wakeup? */ - if (rcu_nocb_need_deferred_wakeup(rdp)) + if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE)) return 1; /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b280a843bd2c..2510e86265c1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -433,7 +433,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, bool *was_alldone, unsigned long flags); static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, unsigned long flags); -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index db28e3123f32..e2e5e4968f43 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2352,13 +2352,14 @@ static int rcu_nocb_cb_kthread(void *arg) } /* Is a deferred wakeup of rcu_nocb_kthread() required? */ -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) { - return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT; + return READ_ONCE(rdp->nocb_defer_wakeup) >= level; } /* Do a deferred wakeup of rcu_nocb_kthread(). */ -static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp, + int level) { unsigned long flags; int ndw; @@ -2367,7 +2368,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp) raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - if (!rcu_nocb_need_deferred_wakeup(rdp_gp)) { + if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) { raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); return false; } @@ -2384,7 +2385,7 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t) { struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); - do_nocb_deferred_wakeup_common(rdp); + do_nocb_deferred_wakeup_common(rdp, RCU_NOCB_WAKE); } /* @@ -2397,8 +2398,8 @@ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rdp->nocb_gp_rdp) return false; - if (rcu_nocb_need_deferred_wakeup(rdp->nocb_gp_rdp)) - return do_nocb_deferred_wakeup_common(rdp); + if (rcu_nocb_need_deferred_wakeup(rdp->nocb_gp_rdp, RCU_NOCB_WAKE)) + return do_nocb_deferred_wakeup_common(rdp, RCU_NOCB_WAKE); return false; } @@ -2939,7 +2940,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) { return false; } From e75bcd48e2c4026b1f3feda916a2327b1744d664 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:11 +0100 Subject: [PATCH 68/76] rcu/nocb: Unify timers Now that ->nocb_timer and ->nocb_bypass_timer have become quite similar, this commit merges them together. A new RCU_NOCB_WAKE_BYPASS wake level is introduced. As a result, timers perform all kinds of deferred wake ups but other deferred wakeup callsites only handle non-bypass wakeups in order not to wake up rcuo too early. The timer also unconditionally executes a full barrier so as to order timer_pending() and callback enqueue although the path performing RCU_NOCB_WAKE_FORCE that makes use of it is debatable. It should also test against the rdp leader instead of the current rdp. This unconditional full barrier shouldn't bring visible overhead since these timers almost never fire. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 + kernel/rcu/tree.h | 6 +-- kernel/rcu/tree_plugin.h | 92 ++++++++++++++++---------------------- 3 files changed, 43 insertions(+), 56 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 6768b64bc738..670e41783edd 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -278,6 +278,7 @@ TRACE_EVENT_RCU(rcu_exp_funnel_lock, * "WakeNot": Don't wake rcuo kthread. * "WakeNotPoll": Don't wake rcuo kthread because it is polling. * "WakeOvfIsDeferred": Wake rcuo kthread later, CB list is huge. + * "WakeBypassIsDeferred": Wake rcuo kthread later, bypass list is contended. * "WokeEmpty": rcuo CB kthread woke to find empty list. */ TRACE_EVENT_RCU(rcu_nocb_wake, diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2510e86265c1..9a16487edfca 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -218,7 +218,6 @@ struct rcu_data { /* The following fields are used by GP kthread, hence own cacheline. */ raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp; - struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */ u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */ u8 nocb_gp_bypass; /* Found a bypass on last scan? */ u8 nocb_gp_gp; /* GP to wait for on last scan? */ @@ -258,8 +257,9 @@ struct rcu_data { /* Values for nocb_defer_wakeup field in struct rcu_data. */ #define RCU_NOCB_WAKE_NOT 0 -#define RCU_NOCB_WAKE 1 -#define RCU_NOCB_WAKE_FORCE 2 +#define RCU_NOCB_WAKE_BYPASS 1 +#define RCU_NOCB_WAKE 2 +#define RCU_NOCB_WAKE_FORCE 3 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e2e5e4968f43..dfb048ec559f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1701,8 +1701,6 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, del_timer(&rdp_gp->nocb_timer); } - del_timer(&rdp_gp->nocb_bypass_timer); - if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); needwake = true; @@ -1740,10 +1738,19 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - if (rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) - mod_timer(&rdp_gp->nocb_timer, jiffies + 1); - if (rdp_gp->nocb_defer_wakeup < waketype) + /* + * Bypass wakeup overrides previous deferments. In case + * of callback storm, no need to wake up too early. + */ + if (waketype == RCU_NOCB_WAKE_BYPASS) { + mod_timer(&rdp_gp->nocb_timer, jiffies + 2); WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } else { + if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE) + mod_timer(&rdp_gp->nocb_timer, jiffies + 1); + if (rdp_gp->nocb_defer_wakeup < waketype) + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); @@ -1995,7 +2002,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, smp_mb(); /* Enqueue before timer_pending(). */ if ((rdp->nocb_cb_sleep || !rcu_segcblist_ready_cbs(&rdp->cblist)) && - !timer_pending(&rdp->nocb_bypass_timer)) { + !timer_pending(&rdp->nocb_timer)) { rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); @@ -2010,19 +2017,6 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, return; } -/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */ -static void do_nocb_bypass_wakeup_timer(struct timer_list *t) -{ - unsigned long flags; - struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer); - - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); - - raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); - smp_mb__after_spinlock(); /* Timer expire before wakeup. */ - __wake_nocb_gp(rdp, rdp, false, flags); -} - /* * Check if we ignore this rdp. * @@ -2175,19 +2169,12 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) my_rdp->nocb_gp_bypass = bypass; my_rdp->nocb_gp_gp = needwait_gp; my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; - if (bypass) { - if (!rcu_nocb_poll) { - raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); - // Avoid race with first bypass CB. - if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); - } - // At least one child with non-empty ->nocb_bypass, so set - // timer in order to avoid stranding its callbacks. - mod_timer(&my_rdp->nocb_bypass_timer, j + 2); - raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); - } + + if (bypass && !rcu_nocb_poll) { + // At least one child with non-empty ->nocb_bypass, so set + // timer in order to avoid stranding its callbacks. + wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, + TPS("WakeBypassIsDeferred")); } if (rcu_nocb_poll) { /* Polling, so trace if first poll in the series. */ @@ -2211,8 +2198,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (!rcu_nocb_poll) { raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); - if (bypass) - del_timer(&my_rdp->nocb_bypass_timer); if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); del_timer(&my_rdp->nocb_timer); @@ -2358,16 +2343,14 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) } /* Do a deferred wakeup of rcu_nocb_kthread(). */ -static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp, - int level) +static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, + struct rcu_data *rdp, int level, + unsigned long flags) + __releases(rdp_gp->nocb_gp_lock) { - unsigned long flags; int ndw; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; int ret; - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) { raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); return false; @@ -2383,9 +2366,15 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp, /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ static void do_nocb_deferred_wakeup_timer(struct timer_list *t) { + unsigned long flags; struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); - do_nocb_deferred_wakeup_common(rdp, RCU_NOCB_WAKE); + WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); + + raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); + smp_mb__after_spinlock(); /* Timer expire before wakeup. */ + do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags); } /* @@ -2395,12 +2384,14 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t) */ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) { - if (!rdp->nocb_gp_rdp) + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)) return false; - if (rcu_nocb_need_deferred_wakeup(rdp->nocb_gp_rdp, RCU_NOCB_WAKE)) - return do_nocb_deferred_wakeup_common(rdp, RCU_NOCB_WAKE); - return false; + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags); } void rcu_nocb_flush_deferred_wakeup(void) @@ -2644,7 +2635,6 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) raw_spin_lock_init(&rdp->nocb_bypass_lock); raw_spin_lock_init(&rdp->nocb_gp_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); - timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0); rcu_cblist_init(&rdp->nocb_bypass); } @@ -2803,13 +2793,12 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) { struct rcu_node *rnp = rdp->mynode; - pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", + pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", rdp->cpu, "kK"[!!rdp->nocb_gp_kthread], "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], "dD"[!!rdp->nocb_defer_wakeup], "tT"[timer_pending(&rdp->nocb_timer)], - "bB"[timer_pending(&rdp->nocb_bypass_timer)], "sS"[!!rdp->nocb_gp_sleep], ".W"[swait_active(&rdp->nocb_gp_wq)], ".W"[swait_active(&rnp->nocb_gp_wq[0])], @@ -2830,7 +2819,6 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) char bufr[20]; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; - bool wastimer; bool wassleep; if (rdp->nocb_gp_rdp == rdp) @@ -2867,15 +2855,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) return; waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); - wastimer = timer_pending(&rdp->nocb_bypass_timer); wassleep = swait_active(&rdp->nocb_gp_wq); - if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep) + if (!rdp->nocb_gp_sleep && !waslocked && !wassleep) return; /* Nothing untowards. */ - pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n", + pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n", "lL"[waslocked], "dD"[!!rdp->nocb_defer_wakeup], - "tT"[wastimer], "sS"[!!rdp->nocb_gp_sleep], ".W"[wassleep]); } From a616aec9aa140ef1ca61b06cec467391cbef11d7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Mar 2021 22:29:10 -0700 Subject: [PATCH 69/76] rcu: Fix various typos in comments Fix ~12 single-word typos in RCU code comments. [ paulmck: Apply feedback from Randy Dunlap. ] Reviewed-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 4 ++-- kernel/rcu/sync.c | 4 ++-- kernel/rcu/tasks.h | 8 ++++---- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 2 +- .../selftests/rcutorture/formal/srcu-cbmc/src/locks.h | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e26547b34ad3..036ff5499ad5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -777,9 +777,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) spin_unlock_irqrestore_rcu_node(sdp, flags); /* - * No local callbacks, so probabalistically probe global state. + * No local callbacks, so probabilistically probe global state. * Exact information would require acquiring locks, which would - * kill scalability, hence the probabalistic nature of the probe. + * kill scalability, hence the probabilistic nature of the probe. */ /* First, see if enough time has passed since the last GP. */ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index d4558ab7a07d..33d896d85902 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -94,9 +94,9 @@ static void rcu_sync_func(struct rcu_head *rhp) rcu_sync_call(rsp); } else { /* - * We're at least a GP after the last rcu_sync_exit(); eveybody + * We're at least a GP after the last rcu_sync_exit(); everybody * will now have observed the write side critical section. - * Let 'em rip!. + * Let 'em rip! */ WRITE_ONCE(rsp->gp_state, GP_IDLE); } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 350ebf5051f9..da906b7f3a86 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -23,7 +23,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. * @cbs_head: Head of callback list. * @cbs_tail: Tail pointer for callback list. - * @cbs_wq: Wait queue allowning new callback to get kthread's attention. + * @cbs_wq: Wait queue allowing new callback to get kthread's attention. * @cbs_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. @@ -504,7 +504,7 @@ DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); * or transition to usermode execution. As such, there are no read-side * primitives analogous to rcu_read_lock() and rcu_read_unlock() because * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. + * through a safe state, not so much for data-structure synchronization. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. @@ -637,7 +637,7 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, * there are no read-side primitives analogous to rcu_read_lock() and * rcu_read_unlock() because this primitive is intended to determine * that all tasks have passed through a safe state, not so much for - * data-strcuture synchronization. + * data-structure synchronization. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. @@ -1163,7 +1163,7 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t) * there are no read-side primitives analogous to rcu_read_lock() and * rcu_read_unlock() because this primitive is intended to determine * that all tasks have passed through a safe state, not so much for - * data-strcuture synchronization. + * data-structure synchronization. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5f1545aab9ed..ed1b5465b3e8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2489,7 +2489,7 @@ int rcutree_dead_cpu(unsigned int cpu) /* * Invoke any RCU callbacks that have made it to the end of their grace - * period. Thottle as specified by rdp->blimit. + * period. Throttle as specified by rdp->blimit. */ static void rcu_do_batch(struct rcu_data *rdp) { @@ -3848,7 +3848,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * * If a full RCU grace period has elapsed since the earlier call from * which oldstate was obtained, return @true, otherwise return @false. - * If @false is returned, it is the caller's responsibilty to invoke this + * If @false is returned, it is the caller's responsibility to invoke this * function later on until it does return @true. Alternatively, the caller * can explicitly wait for a grace period, for example, by passing @oldstate * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). @@ -4094,7 +4094,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); /* * Propagate ->qsinitmask bits up the rcu_node tree to account for the * first CPU in a given leaf rcu_node structure coming online. The caller - * must hold the corresponding leaf rcu_node ->lock with interrrupts + * must hold the corresponding leaf rcu_node ->lock with interrupts * disabled. */ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9a16487edfca..c1ed047cb128 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -153,7 +153,7 @@ struct rcu_data { unsigned long gp_seq; /* Track rsp->gp_seq counter. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ - bool core_needs_qs; /* Core waits for quiesc state. */ + bool core_needs_qs; /* Core waits for quiescent state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index dfb048ec559f..b0c3fb4379c3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2857,7 +2857,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); wassleep = swait_active(&rdp->nocb_gp_wq); if (!rdp->nocb_gp_sleep && !waslocked && !wassleep) - return; /* Nothing untowards. */ + return; /* Nothing untoward. */ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n", "lL"[waslocked], diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h index cf6938d679d7..1e24827f96f1 100644 --- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h +++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h @@ -174,7 +174,7 @@ static inline bool spin_trylock(spinlock_t *lock) } struct completion { - /* Hopefuly this won't overflow. */ + /* Hopefully this won't overflow. */ unsigned int count; }; From 0a580fa65cfa08a40af1a0a2cf73d100863e4981 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Apr 2021 15:24:13 +0200 Subject: [PATCH 70/76] srcu: Early test SRCU polling start Place an early call to start_poll_synchronize_srcu() before the invocation of call_srcu() on the same srcu_struct structure. After the later call to srcu_barrier(), the completion of the first grace period should be visible to a subsequent invocation of poll_state_synchronize_srcu(), and if not, warn. Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Lai Jiangshan Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Joel Fernandes Cc: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index b95ae86c40a7..0aa118ac37ba 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -524,6 +524,7 @@ static void test_callback(struct rcu_head *r) } DEFINE_STATIC_SRCU(early_srcu); +static unsigned long early_srcu_cookie; struct early_boot_kfree_rcu { struct rcu_head rh; @@ -536,8 +537,10 @@ static void early_boot_test_call_rcu(void) struct early_boot_kfree_rcu *rhp; call_rcu(&head, test_callback); - if (IS_ENABLED(CONFIG_SRCU)) + if (IS_ENABLED(CONFIG_SRCU)) { + early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu); call_srcu(&early_srcu, &shead, test_callback); + } rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); if (!WARN_ON_ONCE(!rhp)) kfree_rcu(rhp, rh); @@ -563,6 +566,7 @@ static int rcu_verify_early_boot_tests(void) if (IS_ENABLED(CONFIG_SRCU)) { early_boot_test_counter++; srcu_barrier(&early_srcu); + WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie)); } } if (rcu_self_test_counter != early_boot_test_counter) { From 76c8eaafe4f061f3790112842a2fbb297e4bea88 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Apr 2021 14:30:54 -0700 Subject: [PATCH 71/76] rcu: Create an unrcu_pointer() to remove __rcu from a pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xchg() and cmpxchg() functions are sometimes used to carry out RCU updates. Unfortunately, this can result in sparse warnings for both the old-value and new-value arguments, as well as for the return value. The arguments can be dealt with using RCU_INITIALIZER(): old_p = xchg(&p, RCU_INITIALIZER(new_p)); But a sparse warning still remains due to assigning the __rcu pointer returned from xchg to the (most likely) non-__rcu pointer old_p. This commit therefore provides an unrcu_pointer() macro that strips the __rcu. This macro can be used as follows: old_p = unrcu_pointer(xchg(&p, RCU_INITIALIZER(new_p))); Reported-by: Toke Høiland-Jørgensen Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1199ffd305d1..b071d02a028a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -363,6 +363,20 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ +/** + * unrcu_pointer - mark a pointer as not being RCU protected + * @p: pointer needing to lose its __rcu property + * + * Converts @p from an __rcu pointer to a __kernel pointer. + * This allows an __rcu pointer to be used with xchg() and friends. + */ +#define unrcu_pointer(p) \ +({ \ + typeof(*p) *_________p1 = (typeof(*p) *__force)(p); \ + rcu_check_sparse(p, __rcu); \ + ((typeof(*p) __force __kernel *)(_________p1)); \ +}) + #define __rcu_access_pointer(p, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \ From 1893afd63409111c6edcee9d6e1196fc06cf4fd7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Apr 2021 11:18:01 -0700 Subject: [PATCH 72/76] rcu: Improve comments describing RCU read-side critical sections There are a number of places that call out the fact that preempt-disable regions of code now act as RCU read-side critical sections, where preempt-disable regions of code include irq-disable regions of code, bh-disable regions of code, hardirq handlers, and NMI handlers. However, someone relying solely on (for example) the call_rcu() header comment might well have no idea that preempt-disable regions of code have RCU semantics. This commit therefore updates the header comments for call_rcu(), synchronize_rcu(), rcu_dereference_bh_check(), and rcu_dereference_sched_check() to call out these new(ish) forms of RCU readers. Reported-by: Michel Lespinasse [ paulmck: Apply Matthew Wilcox and Michel Lespinasse feedback. ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 35 ++++++++++++++++++++++++++++------- kernel/rcu/tree.c | 24 ++++++++++++++---------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b071d02a028a..f0eecb9e49c8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -532,7 +532,12 @@ do { \ * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * - * This is the RCU-bh counterpart to rcu_dereference_check(). + * This is the RCU-bh counterpart to rcu_dereference_check(). However, + * please note that starting in v5.0 kernels, vanilla RCU grace periods + * wait for local_bh_disable() regions of code in addition to regions of + * code demarked by rcu_read_lock() and rcu_read_unlock(). This means + * that synchronize_rcu(), call_rcu, and friends all take not only + * rcu_read_lock() but also rcu_read_lock_bh() into account. */ #define rcu_dereference_bh_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu) @@ -543,6 +548,11 @@ do { \ * @c: The conditions under which the dereference will take place * * This is the RCU-sched counterpart to rcu_dereference_check(). + * However, please note that starting in v5.0 kernels, vanilla RCU grace + * periods wait for preempt_disable() regions of code in addition to + * regions of code demarked by rcu_read_lock() and rcu_read_unlock(). + * This means that synchronize_rcu(), call_rcu, and friends all take not + * only rcu_read_lock() but also rcu_read_lock_sched() into account. */ #define rcu_dereference_sched_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \ @@ -634,6 +644,12 @@ do { \ * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * + * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also + * wait for regions of code with preemption disabled, including regions of + * code with interrupts or softirqs disabled. In pre-v5.0 kernels, which + * define synchronize_sched(), only code enclosed within rcu_read_lock() + * and rcu_read_unlock() are guaranteed to be waited for. + * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU @@ -728,9 +744,11 @@ static inline void rcu_read_unlock(void) /** * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * - * This is equivalent of rcu_read_lock(), but also disables softirqs. - * Note that anything else that disables softirqs can also serve as - * an RCU read-side critical section. + * This is equivalent to rcu_read_lock(), but also disables softirqs. + * Note that anything else that disables softirqs can also serve as an RCU + * read-side critical section. However, please note that this equivalence + * applies only to v5.0 and later. Before v5.0, rcu_read_lock() and + * rcu_read_lock_bh() were unrelated. * * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() * must occur in the same context, for example, it is illegal to invoke @@ -763,9 +781,12 @@ static inline void rcu_read_unlock_bh(void) /** * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * - * This is equivalent of rcu_read_lock(), but disables preemption. - * Read-side critical sections can also be introduced by anything else - * that disables preemption, including local_irq_disable() and friends. + * This is equivalent to rcu_read_lock(), but also disables preemption. + * Read-side critical sections can also be introduced by anything else that + * disables preemption, including local_irq_disable() and friends. However, + * please note that the equivalence to rcu_read_lock() applies only to + * v5.0 and later. Before v5.0, rcu_read_lock() and rcu_read_lock_sched() + * were unrelated. * * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() * must occur in the same context, for example, it is illegal to invoke diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2437960a2795..4b00e4fbfa10 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3059,12 +3059,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) * period elapses, in other words after all pre-existing RCU read-side * critical sections have completed. However, the callback function * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and - * may be nested. In addition, regions of code across which interrupts, - * preemption, or softirqs have been disabled also serve as RCU read-side - * critical sections. This includes hardware interrupt handlers, softirq - * handlers, and NMI handlers. + * that started after call_rcu() was invoked. + * + * RCU read-side critical sections are delimited by rcu_read_lock() + * and rcu_read_unlock(), and may be nested. In addition, but only in + * v5.0 and later, regions of code across which interrupts, preemption, + * or softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. * * Note that all CPUs must agree that the grace period extended beyond * all pre-existing RCU read-side critical section. On systems with more @@ -3730,10 +3732,12 @@ static int rcu_blocking_is_gp(void) * read-side critical sections have completed. Note, however, that * upon return from synchronize_rcu(), the caller might well be executing * concurrently with new RCU read-side critical sections that began while - * synchronize_rcu() was waiting. RCU read-side critical sections are - * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. - * In addition, regions of code across which interrupts, preemption, or - * softirqs have been disabled also serve as RCU read-side critical + * synchronize_rcu() was waiting. + * + * RCU read-side critical sections are delimited by rcu_read_lock() + * and rcu_read_unlock(), and may be nested. In addition, but only in + * v5.0 and later, regions of code across which interrupts, preemption, + * or softirqs have been disabled also serve as RCU read-side critical * sections. This includes hardware interrupt handlers, softirq handlers, * and NMI handlers. * From 0223846010750e28e4330f1beefb5564ba406ef7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Apr 2021 11:30:49 -0700 Subject: [PATCH 73/76] rcu: Remove obsolete rcu_read_unlock() deadlock commentary The deferred quiescent states resulting from the consolidation of RCU-bh and RCU-sched into RCU means that rcu_read_unlock() will no longer attempt to acquire scheduler locks if interrupts were disabled across that call to rcu_read_unlock(). The cautions in the rcu_read_unlock() header comment are therefore obsolete. This commit therefore removes them. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f0eecb9e49c8..d9680b798b21 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -702,33 +702,12 @@ static __always_inline void rcu_read_lock(void) /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * - * In most situations, rcu_read_unlock() is immune from deadlock. - * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock() - * is responsible for deboosting, which it does via rt_mutex_unlock(). - * Unfortunately, this function acquires the scheduler's runqueue and - * priority-inheritance spinlocks. This means that deadlock could result - * if the caller of rcu_read_unlock() already holds one of these locks or - * any lock that is ever acquired while holding them. - * - * That said, RCU readers are never priority boosted unless they were - * preempted. Therefore, one way to avoid deadlock is to make sure - * that preemption never happens within any RCU read-side critical - * section whose outermost rcu_read_unlock() is called with one of - * rt_mutex_unlock()'s locks held. Such preemption can be avoided in - * a number of ways, for example, by invoking preempt_disable() before - * critical section's outermost rcu_read_lock(). - * - * Given that the set of locks acquired by rt_mutex_unlock() might change - * at any time, a somewhat more future-proofed approach is to make sure - * that that preemption never happens within any RCU read-side critical - * section whose outermost rcu_read_unlock() is called with irqs disabled. - * This approach relies on the fact that rt_mutex_unlock() currently only - * acquires irq-disabled locks. - * - * The second of these two approaches is best in most situations, - * however, the first approach can also be useful, at least to those - * developers willing to keep abreast of the set of locks acquired by - * rt_mutex_unlock(). + * In almost all situations, rcu_read_unlock() is immune from deadlock. + * In recent kernels that have consolidated synchronize_sched() and + * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity + * also extends to the scheduler's runqueue and priority-inheritance + * spinlocks, courtesy of the quiescent-state deferral that is carried + * out when rcu_read_unlock() is invoked with interrupts disabled. * * See rcu_read_lock() for more information. */ From c70360c3343f975bd066b6b98159d93f1bd4219f Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Thu, 29 Apr 2021 00:12:19 +0100 Subject: [PATCH 74/76] rcu: Add missing __releases() annotation Sparse reports a warning at rcu_print_task_stall(): "warning: context imbalance in rcu_print_task_stall - unexpected unlock" The root cause is a missing annotation on rcu_print_task_stall(). This commit therefore adds the missing __releases(rnp->lock) annotation. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index e6bd518e0bc4..ffb8cf6c6437 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -314,6 +314,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) * tasks blocked within RCU read-side critical sections. */ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; From cf868c2af244417ed276ba7f716b980841a71340 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Mar 2021 17:08:48 -0700 Subject: [PATCH 75/76] rcu-tasks: Make ksoftirqd provide RCU Tasks quiescent states MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Heavy networking load can cause a CPU to execute continuously and indefinitely within ksoftirqd, in which case there will be no voluntary task switches and thus no RCU-tasks quiescent states. This commit therefore causes the exiting rcu_softirq_qs() to provide an RCU-tasks quiescent state. This of course means that __do_softirq() and its callers cannot be invoked from within a tracing trampoline. Reported-by: Toke Høiland-Jørgensen Tested-by: Toke Høiland-Jørgensen Reviewed-by: Masami Hiramatsu Signed-off-by: Paul E. McKenney Cc: Steven Rostedt Cc: Masami Hiramatsu --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e78b2430c16..f4daa4e60b14 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -242,6 +242,7 @@ void rcu_softirq_qs(void) { rcu_qs(); rcu_preempt_deferred_qs(current); + rcu_tasks_qs(current, false); } /* From 474d0997361c07d163693d0de41e76a2f2899d0a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Apr 2021 10:58:07 -0700 Subject: [PATCH 76/76] tasks-rcu: Make show_rcu_tasks_gp_kthreads() be static inline In some architectures, the no-op variant of show_rcu_tasks_gp_kthreads() get "no previous prototype" compiler warnings. These are false positives given that kernel/rcu/tasks.h is included only once. But why put up with the compiler noise? This commit therefore adds "static inline" to this definition to force the compiler to accept this situation, while also moving it to its proper place in kernel/rcu/rcu.h. Reported-by: kernel test robot [ paulmck: Update per Stephen Rothwell feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 ++++ kernel/rcu/tasks.h | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bf0827d4b659..c0b3ab6736b8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -441,7 +441,11 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +#ifdef CONFIG_TASKS_RCU_GENERIC void show_rcu_tasks_gp_kthreads(void); +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void show_rcu_tasks_gp_kthreads(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d6aa352cd705..fc218539d151 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1401,5 +1401,4 @@ void __init rcu_init_tasks_generic(void) #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} -void show_rcu_tasks_gp_kthreads(void) {} #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */