mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-07 13:43:51 +00:00
Merge branch 'report-rcu-qs-for-busy-network-kthreads'
Yan Zhai says:
====================
Report RCU QS for busy network kthreads
This changeset fixes a common problem for busy networking kthreads.
These threads, e.g. NAPI threads, typically will do:
* polling a batch of packets
* if there is more work, call cond_resched() to allow scheduling
* continue to poll more packets when rx queue is not empty
We observed this being a problem in production, since it can block RCU
tasks from making progress under heavy load. Investigation indicates
that just calling cond_resched() is insufficient for RCU tasks to reach
quiescent states. This also has the side effect of frequently clearing
the TIF_NEED_RESCHED flag on voluntary preempt kernels. As a result,
schedule() will not be called in these circumstances, even though schedule()
would in fact provide the required quiescent states. This affects at least NAPI
threads, napi_busy_loop, and the cpumap kthread.
By reporting RCU QSes periodically in these kthreads before cond_resched(), the
blocked RCU waiters can make progress. This code shares the concern noted in
commit
d28139c4e9
("rcu: Apply RCU-bh QSes to RCU-sched and RCU-preempt when safe"),
so report a consolidated QS for safety instead of a QS for RCU Tasks only.
It is worth noting that, although this problem is reproducible in
napi_busy_loop, it only shows up when the polling interval is set as high
as 2ms, which is far larger than the 50us-100us recommended in the documentation.
So napi_busy_loop is left untouched.
Lastly, this does not affect RT kernels, which do not enter the scheduler
through cond_resched(). Without the side effect mentioned above, schedule() will
be called from time to time and clear the RCU Tasks holdouts.
V4: https://lore.kernel.org/bpf/cover.1710525524.git.yan@cloudflare.com/
V3: https://lore.kernel.org/lkml/20240314145459.7b3aedf1@kernel.org/t/
V2: https://lore.kernel.org/bpf/ZeFPz4D121TgvCje@debian.debian/
V1: https://lore.kernel.org/lkml/Zd4DXTyCf17lcTfq@debian.debian/#t
====================
Link: https://lore.kernel.org/r/cover.1710877680.git.yan@cloudflare.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
3201de46a2
@ -247,6 +247,37 @@ do { \
|
||||
cond_resched(); \
|
||||
} while (0)
|
||||
|
||||
/**
 * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states
 * @old_ts: jiffies at start of processing.
 *
 * This helper is for long-running softirq handlers, such as NAPI threads in
 * networking. The caller should initialize the variable passed in as @old_ts
 * at the beginning of the softirq handler. When invoked frequently, this macro
 * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will
 * provide both RCU and RCU-Tasks quiescent states. Note that this macro
 * modifies its old_ts argument: @old_ts must be a modifiable lvalue, it is
 * evaluated more than once, and it is reset to the current jiffies each time
 * a quiescent state is reported.
 *
 * Because regions of code that have disabled softirq act as RCU read-side
 * critical sections, this macro should be invoked with softirq (and
 * preemption) enabled. The macro itself disables preemption only around the
 * rcu_softirq_qs() call and re-enables it immediately afterwards.
 *
 * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would
 * have more chance to invoke schedule() calls and provide necessary quiescent
 * states. In contrast, calling only cond_resched() won't achieve the same
 * effect because cond_resched() does not provide RCU-Tasks quiescent states.
 */
#define rcu_softirq_qs_periodic(old_ts) \
do { \
	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \
	    time_after(jiffies, (old_ts) + HZ / 10)) { \
		preempt_disable(); \
		rcu_softirq_qs(); \
		preempt_enable(); \
		(old_ts) = jiffies; \
	} \
} while (0)
|
||||
|
||||
/*
|
||||
* Infrastructure to implement the synchronize_() primitives in
|
||||
* TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
|
||||
|
@ -263,6 +263,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
|
||||
static int cpu_map_kthread_run(void *data)
|
||||
{
|
||||
struct bpf_cpu_map_entry *rcpu = data;
|
||||
unsigned long last_qs = jiffies;
|
||||
|
||||
complete(&rcpu->kthread_running);
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
@ -288,10 +289,12 @@ static int cpu_map_kthread_run(void *data)
|
||||
if (__ptr_ring_empty(rcpu->queue)) {
|
||||
schedule();
|
||||
sched = 1;
|
||||
last_qs = jiffies;
|
||||
} else {
|
||||
__set_current_state(TASK_RUNNING);
|
||||
}
|
||||
} else {
|
||||
rcu_softirq_qs_periodic(last_qs);
|
||||
sched = cond_resched();
|
||||
}
|
||||
|
||||
|
@ -6743,6 +6743,8 @@ static int napi_threaded_poll(void *data)
|
||||
void *have;
|
||||
|
||||
while (!napi_thread_wait(napi)) {
|
||||
unsigned long last_qs = jiffies;
|
||||
|
||||
for (;;) {
|
||||
bool repoll = false;
|
||||
|
||||
@ -6767,6 +6769,7 @@ static int napi_threaded_poll(void *data)
|
||||
if (!repoll)
|
||||
break;
|
||||
|
||||
rcu_softirq_qs_periodic(last_qs);
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user