Merge branch 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

 - The destruction path of cgroup objects are asynchronous and
   multi-staged and some of them ended up destroying parents before
   children leading to failures in cpu and memory controllers.  Ensure
   that parents are always destroyed after children.

 - cpuset mm node migration was performed synchronously while holding
   threadgroup and cgroup mutexes and the recent threadgroup locking
   update resulted in a possible deadlock.  The migration is best effort
   and shouldn't have been performed under those locks to begin with.
   Made asynchronous.

 - Minor documentation fix.

* 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  Documentation: cgroup: Fix 'cgroup-legacy' -> 'cgroup-v1'
  cgroup: make sure a parent css isn't freed before its children
  cgroup: make sure a parent css isn't offlined before its children
  cpuset: make mm migration asynchronous
This commit is contained in:
Linus Torvalds 2016-02-10 11:36:19 -08:00
commit fb0dc5f129
5 changed files with 85 additions and 31 deletions

View File

@ -7,7 +7,7 @@ This is the authoritative documentation on the design, interface and
conventions of cgroup v2. It describes all userland-visible aspects
of cgroup including core and specific controller behaviors. All
future changes must be reflected in this document. Documentation for
v1 is available under Documentation/cgroup-legacy/.
v1 is available under Documentation/cgroup-v1/.
CONTENTS

View File

@ -127,6 +127,12 @@ struct cgroup_subsys_state {
*/
u64 serial_nr;
/*
* Incremented by online self and children. Used to guarantee that
* parents are not offlined before their children.
*/
atomic_t online_cnt;
/* percpu_ref killing and RCU release */
struct rcu_head rcu_head;
struct work_struct destroy_work;

View File

@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
task_unlock(current);
}
extern void cpuset_post_attach_flush(void);
#else /* !CONFIG_CPUSETS */
static inline bool cpusets_enabled(void) { return false; }
@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
return false;
}
static inline void cpuset_post_attach_flush(void)
{
}
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */

View File

@ -58,6 +58,7 @@
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <net/sock.h>
/*
@ -2739,6 +2740,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
cgroup_kn_unlock(of->kn);
cpuset_post_attach_flush();
return ret ?: nbytes;
}
@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work)
if (ss) {
/* css free path */
struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
if (css->parent)
css_put(css->parent);
ss->css_free(css);
cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
if (parent)
css_put(parent);
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
if (cgroup_parent(cgrp)) {
css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css)
if (!ret) {
css->flags |= CSS_ONLINE;
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
atomic_inc(&css->online_cnt);
if (css->parent)
atomic_inc(&css->parent->online_cnt);
}
return ret;
}
@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
mutex_lock(&cgroup_mutex);
offline_css(css);
mutex_unlock(&cgroup_mutex);
css_put(css);
do {
offline_css(css);
css_put(css);
/* @css can't go away while we're holding cgroup_mutex */
css = css->parent;
} while (css && atomic_dec_and_test(&css->online_cnt));
mutex_unlock(&cgroup_mutex);
}
/* css kill confirmation processing requires process context, bounce */
@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_killed_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
}
}
/**

View File

@ -287,6 +287,8 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);
static struct workqueue_struct *cpuset_migrate_mm_wq;
/*
* CPU / memory hotplug is handled asynchronously.
*/
@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
}
/*
* cpuset_migrate_mm
*
* Migrate memory region from one set of nodes to another.
*
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
* While the mm_struct we are migrating is typically from some
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
* migrating memory region.
* Migrate memory region from one set of nodes to another. This is
* performed asynchronously as it can be called from process migration path
* holding locks involved in process management. All mm migrations are
* performed in the queued order and can be waited for by flushing
* cpuset_migrate_mm_wq.
*/
struct cpuset_migrate_mm_work {
struct work_struct work;
struct mm_struct *mm;
nodemask_t from;
nodemask_t to;
};
static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
struct cpuset_migrate_mm_work *mwork =
container_of(work, struct cpuset_migrate_mm_work, work);
/* on a wq worker, no need to worry about %current's mems_allowed */
do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
mmput(mwork->mm);
kfree(mwork);
}
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
struct cpuset_migrate_mm_work *mwork;
tsk->mems_allowed = *to;
mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
if (mwork) {
mwork->mm = mm;
mwork->from = *from;
mwork->to = *to;
INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
queue_work(cpuset_migrate_mm_wq, &mwork->work);
} else {
mmput(mm);
}
}
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
rcu_read_unlock();
void cpuset_post_attach_flush(void)
{
flush_workqueue(cpuset_migrate_mm_wq);
}
/*
@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
mmput(mm);
else
mmput(mm);
}
css_task_iter_end(&it);
@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
if (is_memory_migrate(cs)) {
if (is_memory_migrate(cs))
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
}
mmput(mm);
else
mmput(mm);
}
}
@ -1714,6 +1737,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
mutex_unlock(&cpuset_mutex);
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
return retval ?: nbytes;
}
@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void)
top_cpuset.effective_mems = node_states[N_MEMORY];
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
}
/**