kthread: Default affine kthread to its preferred NUMA node

Kthreads attached to a preferred NUMA node for their task structure
allocation can also be assumed to run preferrably within that same node.

A more precise affinity is usually notified by calling
kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.

For the others, a default affinity to the node is desired and sometimes
implemented with more or less success when it comes to deal with hotplug
events and nohz_full / CPU Isolation interactions:

- kcompactd is affine to its node and handles hotplug but not CPU Isolation
- kswapd is affine to its node and ignores hotplug and CPU Isolation
- A bunch of drivers create their kthreads on a specific node and
  don't take care about affining further.

Handle that default node affinity preference at the generic level
instead, provided a kthread is created on an actual node and doesn't
apply any specific affinity such as a given CPU or a custom cpumask to
bind to before its first wake-up.

This generic handling is aware of CPU hotplug events and CPU isolation
such that:

* When a housekeeping CPU goes up that is part of the node of a given
  kthread, the related task is re-affined to that own node if it was
  previously running on the default last resort online housekeeping set
  from other nodes.

* When a housekeeping CPU goes down while it was part of the node of a
  kthread, the running task is migrated (or the sleeping task is woken
  up) automatically by the scheduler to other housekeepers within the
  same node or, as a last resort, to all housekeepers from other nodes.

Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
This commit is contained in:
Frederic Weisbecker 2024-09-27 00:49:01 +02:00
parent 5eacb68a35
commit d1a8919758
2 changed files with 106 additions and 1 deletions

View File

@ -240,6 +240,7 @@ enum cpuhp_state {
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RANDOM_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
CPUHP_AP_KTHREADS_ONLINE,
CPUHP_AP_BASE_CACHEINFO_ONLINE,
CPUHP_AP_ONLINE_DYN,
CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 40,

View File

@ -35,6 +35,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
static LIST_HEAD(kthreads_hotplug);
static DEFINE_MUTEX(kthreads_hotplug_lock);
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
@ -53,6 +56,7 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
unsigned int node;
int started;
int result;
int (*threadfn)(void *);
@ -64,6 +68,8 @@ struct kthread {
#endif
/* To store the full name if task comm is truncated. */
char *full_name;
struct task_struct *task;
struct list_head hotplug_node;
};
enum KTHREAD_BITS {
@ -122,8 +128,11 @@ bool set_kthread_struct(struct task_struct *p)
init_completion(&kthread->exited);
init_completion(&kthread->parked);
INIT_LIST_HEAD(&kthread->hotplug_node);
p->vfork_done = &kthread->exited;
kthread->task = p;
kthread->node = tsk_fork_get_node(current);
p->worker_private = kthread;
return true;
}
@ -314,6 +323,11 @@ void __noreturn kthread_exit(long result)
{
struct kthread *kthread = to_kthread(current);
kthread->result = result;
if (!list_empty(&kthread->hotplug_node)) {
mutex_lock(&kthreads_hotplug_lock);
list_del(&kthread->hotplug_node);
mutex_unlock(&kthreads_hotplug_lock);
}
do_exit(0);
}
EXPORT_SYMBOL(kthread_exit);
@ -339,6 +353,48 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
}
EXPORT_SYMBOL(kthread_complete_and_exit);
static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask)
{
cpumask_and(cpumask, cpumask_of_node(kthread->node),
housekeeping_cpumask(HK_TYPE_KTHREAD));
if (cpumask_empty(cpumask))
cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
}
static void kthread_affine_node(void)
{
struct kthread *kthread = to_kthread(current);
cpumask_var_t affinity;
WARN_ON_ONCE(kthread_is_per_cpu(current));
if (kthread->node == NUMA_NO_NODE) {
housekeeping_affine(current, HK_TYPE_KTHREAD);
} else {
if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
WARN_ON_ONCE(1);
return;
}
mutex_lock(&kthreads_hotplug_lock);
WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
/*
* The node cpumask is racy when read from kthread() but:
* - a racing CPU going down will either fail on the subsequent
* call to set_cpus_allowed_ptr() or be migrated to housekeepers
* afterwards by the scheduler.
* - a racing CPU going up will be handled by kthreads_online_cpu()
*/
kthread_fetch_affinity(kthread, affinity);
set_cpus_allowed_ptr(current, affinity);
mutex_unlock(&kthreads_hotplug_lock);
free_cpumask_var(affinity);
}
}
static int kthread(void *_create)
{
static const struct sched_param param = { .sched_priority = 0 };
@ -369,7 +425,6 @@ static int kthread(void *_create)
* back to default in case they have been changed.
*/
sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@ -385,6 +440,9 @@ static int kthread(void *_create)
self->started = 1;
if (!(current->flags & PF_NO_SETAFFINITY))
kthread_affine_node();
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
cgroup_kthread_ready();
@ -781,6 +839,52 @@ int kthreadd(void *unused)
return 0;
}
/*
* Re-affine kthreads according to their preferences
* and the newly online CPU. The CPU down part is handled
* by select_fallback_rq() which default re-affines to
* housekeepers in case the preferred affinity doesn't
* apply anymore.
*/
static int kthreads_online_cpu(unsigned int cpu)
{
cpumask_var_t affinity;
struct kthread *k;
int ret;
guard(mutex)(&kthreads_hotplug_lock);
if (list_empty(&kthreads_hotplug))
return 0;
if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
return -ENOMEM;
ret = 0;
list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
kthread_is_per_cpu(k->task) ||
k->node == NUMA_NO_NODE)) {
ret = -EINVAL;
continue;
}
kthread_fetch_affinity(k, affinity);
set_cpus_allowed_ptr(k->task, affinity);
}
free_cpumask_var(affinity);
return ret;
}
static int kthreads_init(void)
{
return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
kthreads_online_cpu, NULL);
}
early_initcall(kthreads_init);
void __kthread_init_worker(struct kthread_worker *worker,
const char *name,
struct lock_class_key *key)