From 024914477e15ef8b17f271ec47f1bb8a589f0806 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 10 Mar 2010 15:22:17 -0800 Subject: [PATCH] memcg: move charges of anonymous swap This patch is another core part of this move-charge-at-task-migration feature. It enables moving charges of anonymous swaps. To move the charge of swap, we need to exchange swap_cgroup's record. In current implementation, swap_cgroup's record is protected by: - page lock: if the entry is on swap cache. - swap_lock: if the entry is not on swap cache. This works well in usual swap-in/out activity. But this behavior make the feature of moving swap charge check many conditions to exchange swap_cgroup's record safely. So I changed modification of swap_cgroup's recored(swap_cgroup_record()) to use xchg, and define a new function to cmpxchg swap_cgroup's record. This patch also enables moving charge of non pte_present but not uncharged swap caches, which can be exist on swap-out path, by getting the target pages via find_get_page() as do_mincore() does. [kosaki.motohiro@jp.fujitsu.com: fix ia64 build] [akpm@linux-foundation.org: fix typos] Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Paul Menage Cc: Daisuke Nishimura Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 2 + include/linux/page_cgroup.h | 2 + include/linux/swap.h | 9 ++ mm/memcontrol.c | 187 ++++++++++++++++++++++++------- mm/page_cgroup.c | 34 +++++- mm/swapfile.c | 31 +++++ 6 files changed, 225 insertions(+), 40 deletions(-) diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index e726fb0df719..1f59a1a38bd9 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -420,6 +420,8 @@ NOTE2: It is recommended to set the soft limit always below the hard limit, Users can move charges associated with a task along with task migration, that is, uncharge task's pages from the old cgroup and charge them to the new cgroup. +This feature is not supported in !CONFIG_MMU environments because of lack of +page tables. 8.1 Interface diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index b0e4eb126236..30b08136fdf3 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -118,6 +118,8 @@ static inline void __init page_cgroup_init_flatmem(void) #include #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new); extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); extern unsigned short lookup_swap_cgroup(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); diff --git a/include/linux/swap.h b/include/linux/swap.h index a2602a8207a6..1f59d9340c4d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -355,6 +355,7 @@ static inline void disable_swap_token(void) #ifdef CONFIG_CGROUP_MEM_RES_CTLR extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); +extern int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep); #else static inline void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) @@ -485,6 +486,14 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) { } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +static inline int +mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) +{ + return 0; +} +#endif + #endif /* CONFIG_SWAP */ #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 589084f00b70..e883198baf81 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -2270,6 +2271,54 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) } rcu_read_unlock(); } + +/** + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. + * @entry: swap entry to be moved + * @from: mem_cgroup which the entry is moved from + * @to: mem_cgroup which the entry is moved to + * + * It succeeds only when the swap_cgroup's record for this entry is the same + * as the mem_cgroup's id of @from. + * + * Returns 0 on success, -EINVAL on failure. + * + * The caller must have charged to @to, IOW, called res_counter_charge() about + * both res and memsw, and called css_get(). + */ +static int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) +{ + unsigned short old_id, new_id; + + old_id = css_id(&from->css); + new_id = css_id(&to->css); + + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { + if (!mem_cgroup_is_root(from)) + res_counter_uncharge(&from->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(from, false); + mem_cgroup_put(from); + /* + * we charged both to->res and to->memsw, so we should uncharge + * to->res. + */ + if (!mem_cgroup_is_root(to)) + res_counter_uncharge(&to->res, PAGE_SIZE); + mem_cgroup_swap_statistics(to, true); + mem_cgroup_get(to); + css_put(&to->css); + + return 0; + } + return -EINVAL; +} +#else +static inline int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) +{ + return -EINVAL; +} #endif /* @@ -2949,6 +2998,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; } +#ifdef CONFIG_MMU static int mem_cgroup_move_charge_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { @@ -2967,6 +3017,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, return 0; } +#else +static int mem_cgroup_move_charge_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + return -ENOSYS; +} +#endif /* For read statistics */ @@ -3489,6 +3546,7 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, return ret; } +#ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ #define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) @@ -3544,77 +3602,124 @@ static int mem_cgroup_do_precharge(unsigned long count) } return ret; } +#else /* !CONFIG_MMU */ +static int mem_cgroup_can_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ + return 0; +} +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ +} +static void mem_cgroup_move_task(struct cgroup_subsys *ss, + struct cgroup *cont, + struct cgroup *old_cont, + struct task_struct *p, + bool threadgroup) +{ +} +#endif /** * is_target_pte_for_mc - check a pte whether it is valid for move charge * @vma: the vma the pte to be checked belongs * @addr: the address corresponding to the pte to be checked * @ptent: the pte to be checked - * @target: the pointer the target page will be stored(can be NULL) + * @target: the pointer the target page or swap ent will be stored(can be NULL) * * Returns * 0(MC_TARGET_NONE): if the pte is not a target for move charge. * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for * move charge. if @target is not NULL, the page is stored in target->page * with extra refcnt got(Callers should handle it). + * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a + * target for charge migration. if @target is not NULL, the entry is stored + * in target->ent. * * Called with pte lock held. */ -/* We add a new member later. */ union mc_target { struct page *page; + swp_entry_t ent; }; -/* We add a new type later. */ enum mc_target_type { MC_TARGET_NONE, /* not used */ MC_TARGET_PAGE, + MC_TARGET_SWAP, }; static int is_target_pte_for_mc(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { - struct page *page; + struct page *page = NULL; struct page_cgroup *pc; int ret = 0; + swp_entry_t ent = { .val = 0 }; + int usage_count = 0; bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, &mc.to->move_charge_at_immigrate); - if (!pte_present(ptent)) - return 0; - - page = vm_normal_page(vma, addr, ptent); - if (!page || !page_mapped(page)) - return 0; - /* - * TODO: We don't move charges of file(including shmem/tmpfs) pages for - * now. - */ - if (!move_anon || !PageAnon(page)) - return 0; - /* - * TODO: We don't move charges of shared(used by multiple processes) - * pages for now. - */ - if (page_mapcount(page) > 1) - return 0; - if (!get_page_unless_zero(page)) - return 0; - - pc = lookup_page_cgroup(page); - /* - * Do only loose check w/o page_cgroup lock. mem_cgroup_move_account() - * checks the pc is valid or not under the lock. - */ - if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { - ret = MC_TARGET_PAGE; - if (target) - target->page = page; + if (!pte_present(ptent)) { + /* TODO: handle swap of shmes/tmpfs */ + if (pte_none(ptent) || pte_file(ptent)) + return 0; + else if (is_swap_pte(ptent)) { + ent = pte_to_swp_entry(ptent); + if (!move_anon || non_swap_entry(ent)) + return 0; + usage_count = mem_cgroup_count_swap_user(ent, &page); + } + } else { + page = vm_normal_page(vma, addr, ptent); + if (!page || !page_mapped(page)) + return 0; + /* + * TODO: We don't move charges of file(including shmem/tmpfs) + * pages for now. + */ + if (!move_anon || !PageAnon(page)) + return 0; + if (!get_page_unless_zero(page)) + return 0; + usage_count = page_mapcount(page); + } + if (usage_count > 1) { + /* + * TODO: We don't move charges of shared(used by multiple + * processes) pages for now. + */ + if (page) + put_page(page); + return 0; + } + if (page) { + pc = lookup_page_cgroup(page); + /* + * Do only loose check w/o page_cgroup lock. + * mem_cgroup_move_account() checks the pc is valid or not under + * the lock. + */ + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + ret = MC_TARGET_PAGE; + if (target) + target->page = page; + } + if (!ret || !target) + put_page(page); + } + /* throught */ + if (ent.val && do_swap_account && !ret && + css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { + ret = MC_TARGET_SWAP; + if (target) + target->ent = ent; } - - if (!ret || !target) - put_page(page); - return ret; } @@ -3754,6 +3859,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, int type; struct page *page; struct page_cgroup *pc; + swp_entry_t ent; if (!mc.precharge) break; @@ -3775,6 +3881,11 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, put: /* is_target_pte_for_mc() gets the page */ put_page(page); break; + case MC_TARGET_SWAP: + ent = target.ent; + if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) + mc.precharge--; + break; default: break; } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3d535d594826..3dd88539a0e6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -334,6 +334,37 @@ static int swap_cgroup_prepare(int type) return -ENOMEM; } +/** + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. + * @end: swap entry to be cmpxchged + * @old: old id + * @new: new id + * + * Returns old id at success, 0 at failure. + * (There is no mem_cgroup useing 0 as its id) + */ +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new) +{ + int type = swp_type(ent); + unsigned long offset = swp_offset(ent); + unsigned long idx = offset / SC_PER_PAGE; + unsigned long pos = offset & SC_POS_MASK; + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + + ctrl = &swap_cgroup_ctrl[type]; + + mappage = ctrl->map[idx]; + sc = page_address(mappage); + sc += pos; + if (cmpxchg(&sc->id, old, new) == old) + return old; + else + return 0; +} + /** * swap_cgroup_record - record mem_cgroup for this swp_entry. * @ent: swap entry to be recorded into @@ -358,8 +389,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; - old = sc->id; - sc->id = id; + old = xchg(&sc->id, id); return old; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 84374d8cf814..6cd0a8f90dc7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry) return p != NULL; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_count_swap_user - count the user of a swap entry + * @ent: the swap entry to be checked + * @pagep: the pointer for the swap cache page of the entry to be stored + * + * Returns the number of the user of the swap entry. The number is valid only + * for swaps of anonymous pages. + * If the entry is found on swap cache, the page is stored to pagep with + * refcount of it being incremented. + */ +int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) +{ + struct page *page; + struct swap_info_struct *p; + int count = 0; + + page = find_get_page(&swapper_space, ent.val); + if (page) + count += page_mapcount(page); + p = swap_info_get(ent); + if (p) { + count += swap_count(p->swap_map[swp_offset(ent)]); + spin_unlock(&swap_lock); + } + + *pagep = page; + return count; +} +#endif + #ifdef CONFIG_HIBERNATION /* * Find the swap type that corresponds to given device (if any).