mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-04 04:06:26 +00:00
rcu/exp: Remove rcu_par_gp_wq
TREE04 running on short iterations can produce writer stalls of the following kind: ??? Writer stall state RTWS_EXP_SYNC(4) g3968 f0x0 ->state 0x2 cpu 0 task:rcu_torture_wri state:D stack:14568 pid:83 ppid:2 flags:0x00004000 Call Trace: <TASK> __schedule+0x2de/0x850 ? trace_event_raw_event_rcu_exp_funnel_lock+0x6d/0xb0 schedule+0x4f/0x90 synchronize_rcu_expedited+0x430/0x670 ? __pfx_autoremove_wake_function+0x10/0x10 ? __pfx_synchronize_rcu_expedited+0x10/0x10 do_rtws_sync.constprop.0+0xde/0x230 rcu_torture_writer+0x4b4/0xcd0 ? __pfx_rcu_torture_writer+0x10/0x10 kthread+0xc7/0xf0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 </TASK> Waiting for an expedited grace period and polling for an expedited grace period both are operations that internally rely on the same workqueue performing necessary asynchronous work. However, a dependency chain is involved between those two operations, as depicted below: ====== CPU 0 ======= ====== CPU 1 ======= synchronize_rcu_expedited() exp_funnel_lock() mutex_lock(&rcu_state.exp_mutex); start_poll_synchronize_rcu_expedited queue_work(rcu_gp_wq, &rnp->exp_poll_wq); synchronize_rcu_expedited_queue_work() queue_work(rcu_gp_wq, &rew->rew_work); wait_event() // A, wait for &rew->rew_work completion mutex_unlock() // B //======> switch to kworker sync_rcu_do_polled_gp() { synchronize_rcu_expedited() exp_funnel_lock() mutex_lock(&rcu_state.exp_mutex); // C, wait B .... } // D Since workqueues are usually implemented on top of several kworkers handling the queue concurrently, the above situation wouldn't deadlock most of the time because A then doesn't depend on D. But in case of memory stress, a single kworker may end up handling alone all the works in a serialized way. In that case the above layout becomes a problem because A then waits for D, closing a circular dependency: A -> D -> C -> B -> A This however only happens when CONFIG_RCU_EXP_KTHREAD=n. Indeed synchronize_rcu_expedited() is otherwise implemented on top of a kthread worker while polling still relies on rcu_gp_wq workqueue, breaking the above circular dependency chain. Fix this with making expedited grace period to always rely on kthread worker. The workqueue based implementation is essentially a duplicate anyway now that the per-node initialization is performed by per-node kthread workers. Meanwhile the CONFIG_RCU_EXP_KTHREAD switch is still kept around to manage the scheduler policy of these kthread workers. Reported-by: Anna-Maria Behnsen <anna-maria@linutronix.de> Reported-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Joel Fernandes <joel@joelfernandes.org> Suggested-by: Paul E. McKenney <paulmck@kernel.org> Suggested-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com> Signed-off-by: Frederic Weisbecker <frederic@kernel.org> Reviewed-by: Paul E. McKenney <paulmck@kernel.org> Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
This commit is contained in:
parent
b67cffcbbf
commit
23da2ad64d
@ -623,11 +623,7 @@ int rcu_get_gp_kthreads_prio(void);
|
||||
void rcu_fwd_progress_check(unsigned long j);
|
||||
void rcu_force_quiescent_state(void);
|
||||
extern struct workqueue_struct *rcu_gp_wq;
|
||||
#ifdef CONFIG_RCU_EXP_KTHREAD
|
||||
extern struct kthread_worker *rcu_exp_gp_kworker;
|
||||
#else /* !CONFIG_RCU_EXP_KTHREAD */
|
||||
extern struct workqueue_struct *rcu_par_gp_wq;
|
||||
#endif /* CONFIG_RCU_EXP_KTHREAD */
|
||||
void rcu_gp_slow_register(atomic_t *rgssp);
|
||||
void rcu_gp_slow_unregister(atomic_t *rgssp);
|
||||
#endif /* #else #ifdef CONFIG_TINY_RCU */
|
||||
|
@ -4394,7 +4394,6 @@ rcu_boot_init_percpu_data(int cpu)
|
||||
rcu_boot_init_nocb_percpu_data(rdp);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_RCU_EXP_KTHREAD
|
||||
struct kthread_worker *rcu_exp_gp_kworker;
|
||||
|
||||
static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
|
||||
@ -4414,7 +4413,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
|
||||
return;
|
||||
}
|
||||
WRITE_ONCE(rnp->exp_kworker, kworker);
|
||||
sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m);
|
||||
|
||||
if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
|
||||
sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m);
|
||||
}
|
||||
|
||||
static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
|
||||
@ -4438,39 +4439,14 @@ static void __init rcu_start_exp_gp_kworker(void)
|
||||
rcu_exp_gp_kworker = NULL;
|
||||
return;
|
||||
}
|
||||
sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m);
|
||||
}
|
||||
|
||||
static inline void rcu_alloc_par_gp_wq(void)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
|
||||
sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m);
|
||||
}
|
||||
#else /* !CONFIG_RCU_EXP_KTHREAD */
|
||||
struct workqueue_struct *rcu_par_gp_wq;
|
||||
|
||||
static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
|
||||
{
|
||||
}
|
||||
|
||||
static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void __init rcu_start_exp_gp_kworker(void)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void rcu_alloc_par_gp_wq(void)
|
||||
{
|
||||
rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
|
||||
WARN_ON(!rcu_par_gp_wq);
|
||||
}
|
||||
#endif /* CONFIG_RCU_EXP_KTHREAD */
|
||||
|
||||
static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
|
||||
{
|
||||
if ((IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) ||
|
||||
IS_ENABLED(CONFIG_RCU_BOOST)) && rcu_scheduler_fully_active) {
|
||||
if (rcu_scheduler_fully_active) {
|
||||
mutex_lock(&rnp->kthread_mutex);
|
||||
rcu_spawn_one_boost_kthread(rnp);
|
||||
rcu_spawn_exp_par_gp_kworker(rnp);
|
||||
@ -4554,9 +4530,6 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
|
||||
struct rcu_node *rnp;
|
||||
struct task_struct *task_boost, *task_exp;
|
||||
|
||||
if (!IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) && !IS_ENABLED(CONFIG_RCU_BOOST))
|
||||
return;
|
||||
|
||||
rdp = per_cpu_ptr(&rcu_data, cpu);
|
||||
rnp = rdp->mynode;
|
||||
|
||||
@ -5245,7 +5218,6 @@ void __init rcu_init(void)
|
||||
/* Create workqueue for Tree SRCU and for expedited GPs. */
|
||||
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
|
||||
WARN_ON(!rcu_gp_wq);
|
||||
rcu_alloc_par_gp_wq();
|
||||
|
||||
/* Fill in default value for rcutree.qovld boot parameter. */
|
||||
/* -After- the rcu_node ->lock fields are initialized! */
|
||||
|
@ -21,14 +21,10 @@
|
||||
|
||||
#include "rcu_segcblist.h"
|
||||
|
||||
/* Communicate arguments to a workqueue handler. */
|
||||
/* Communicate arguments to a kthread worker handler. */
|
||||
struct rcu_exp_work {
|
||||
unsigned long rew_s;
|
||||
#ifdef CONFIG_RCU_EXP_KTHREAD
|
||||
struct kthread_work rew_work;
|
||||
#else
|
||||
struct work_struct rew_work;
|
||||
#endif /* CONFIG_RCU_EXP_KTHREAD */
|
||||
};
|
||||
|
||||
/* RCU's kthread states for tracing. */
|
||||
|
@ -418,7 +418,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
|
||||
|
||||
static void rcu_exp_sel_wait_wake(unsigned long s);
|
||||
|
||||
#ifdef CONFIG_RCU_EXP_KTHREAD
|
||||
static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
|
||||
{
|
||||
struct rcu_exp_work *rewp =
|
||||
@ -470,69 +469,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew
|
||||
kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
|
||||
}
|
||||
|
||||
static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
|
||||
{
|
||||
}
|
||||
#else /* !CONFIG_RCU_EXP_KTHREAD */
|
||||
static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
|
||||
{
|
||||
struct rcu_exp_work *rewp =
|
||||
container_of(wp, struct rcu_exp_work, rew_work);
|
||||
|
||||
__sync_rcu_exp_select_node_cpus(rewp);
|
||||
}
|
||||
|
||||
static inline bool rcu_exp_worker_started(void)
|
||||
{
|
||||
return !!READ_ONCE(rcu_gp_wq);
|
||||
}
|
||||
|
||||
static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp)
|
||||
{
|
||||
return !!READ_ONCE(rcu_par_gp_wq);
|
||||
}
|
||||
|
||||
static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
|
||||
{
|
||||
int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
|
||||
|
||||
INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
|
||||
/* If all offline, queue the work on an unbound CPU. */
|
||||
if (unlikely(cpu > rnp->grphi - rnp->grplo))
|
||||
cpu = WORK_CPU_UNBOUND;
|
||||
else
|
||||
cpu += rnp->grplo;
|
||||
queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
|
||||
}
|
||||
|
||||
static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
|
||||
{
|
||||
flush_work(&rnp->rew.rew_work);
|
||||
}
|
||||
|
||||
/*
|
||||
* Work-queue handler to drive an expedited grace period forward.
|
||||
*/
|
||||
static void wait_rcu_exp_gp(struct work_struct *wp)
|
||||
{
|
||||
struct rcu_exp_work *rewp;
|
||||
|
||||
rewp = container_of(wp, struct rcu_exp_work, rew_work);
|
||||
rcu_exp_sel_wait_wake(rewp->rew_s);
|
||||
}
|
||||
|
||||
static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
|
||||
{
|
||||
INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
|
||||
queue_work(rcu_gp_wq, &rew->rew_work);
|
||||
}
|
||||
|
||||
static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
|
||||
{
|
||||
destroy_work_on_stack(&rew->rew_work);
|
||||
}
|
||||
#endif /* CONFIG_RCU_EXP_KTHREAD */
|
||||
|
||||
/*
|
||||
* Select the nodes that the upcoming expedited grace period needs
|
||||
* to wait for.
|
||||
@ -965,7 +901,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
|
||||
*/
|
||||
void synchronize_rcu_expedited(void)
|
||||
{
|
||||
bool use_worker;
|
||||
unsigned long flags;
|
||||
struct rcu_exp_work rew;
|
||||
struct rcu_node *rnp;
|
||||
@ -976,9 +911,6 @@ void synchronize_rcu_expedited(void)
|
||||
lock_is_held(&rcu_sched_lock_map),
|
||||
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
|
||||
|
||||
use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) &&
|
||||
rcu_exp_worker_started();
|
||||
|
||||
/* Is the state is such that the call is a grace period? */
|
||||
if (rcu_blocking_is_gp()) {
|
||||
// Note well that this code runs with !PREEMPT && !SMP.
|
||||
@ -1008,7 +940,7 @@ void synchronize_rcu_expedited(void)
|
||||
return; /* Someone else did our work for us. */
|
||||
|
||||
/* Ensure that load happens before action based on it. */
|
||||
if (unlikely(!use_worker)) {
|
||||
if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) {
|
||||
/* Direct call during scheduler init and early_initcalls(). */
|
||||
rcu_exp_sel_wait_wake(s);
|
||||
} else {
|
||||
@ -1025,9 +957,6 @@ void synchronize_rcu_expedited(void)
|
||||
|
||||
/* Let the next expedited grace period start. */
|
||||
mutex_unlock(&rcu_state.exp_mutex);
|
||||
|
||||
if (likely(use_worker))
|
||||
synchronize_rcu_expedited_destroy_work(&rew);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user