2008-02-07 00:13:50 -08:00
|
|
|
/* memcontrol.h - Memory Controller
|
|
|
|
*
|
|
|
|
* Copyright IBM Corporation, 2007
|
|
|
|
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
|
|
|
|
*
|
2008-02-07 00:13:51 -08:00
|
|
|
* Copyright 2007 OpenVZ SWsoft Inc
|
|
|
|
* Author: Pavel Emelianov <xemul@openvz.org>
|
|
|
|
*
|
2008-02-07 00:13:50 -08:00
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _LINUX_MEMCONTROL_H
|
|
|
|
#define _LINUX_MEMCONTROL_H
|
2009-01-07 18:08:02 -08:00
|
|
|
#include <linux/cgroup.h>
|
2011-05-26 16:25:38 -07:00
|
|
|
#include <linux/vm_event_item.h>
|
|
|
|
|
2008-02-07 00:13:51 -08:00
|
|
|
struct mem_cgroup;
|
|
|
|
struct page_cgroup;
|
2008-02-07 00:13:59 -08:00
|
|
|
struct page;
|
|
|
|
struct mm_struct;
|
2008-02-07 00:13:51 -08:00
|
|
|
|
2011-01-13 15:47:37 -08:00
|
|
|
/* Stats that can be updated by kernel. */
|
|
|
|
enum mem_cgroup_page_stat_item {
|
|
|
|
MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
|
|
|
|
};
|
|
|
|
|
2012-01-12 17:17:59 -08:00
|
|
|
struct mem_cgroup_reclaim_cookie {
|
|
|
|
struct zone *zone;
|
|
|
|
int priority;
|
|
|
|
unsigned int generation;
|
|
|
|
};
|
|
|
|
|
2008-03-04 14:28:39 -08:00
|
|
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
2009-01-07 18:08:10 -08:00
|
|
|
/*
|
|
|
|
* All "charge" functions with gfp_mask should use GFP_KERNEL or
|
|
|
|
* (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't
|
|
|
|
* alloc memory but reclaims memory from all available zones. So, "where I want
|
|
|
|
* memory from" bits of gfp_mask has no meaning. So any bits of that field is
|
|
|
|
* available but adding a rule is better. charge functions' gfp_mask should
|
|
|
|
* be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous
|
|
|
|
* codes.
|
|
|
|
* (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
|
|
|
|
*/
|
2008-02-07 00:13:51 -08:00
|
|
|
|
2009-01-07 18:07:48 -08:00
|
|
|
extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
|
2008-02-07 00:14:02 -08:00
|
|
|
gfp_t gfp_mask);
|
2009-01-07 18:07:48 -08:00
|
|
|
/* for swap handling */
|
2009-01-07 18:08:00 -08:00
|
|
|
extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
|
2012-01-12 17:18:32 -08:00
|
|
|
struct page *page, gfp_t mask, struct mem_cgroup **memcgp);
|
2009-01-07 18:07:48 -08:00
|
|
|
extern void mem_cgroup_commit_charge_swapin(struct page *page,
|
2012-01-12 17:18:32 -08:00
|
|
|
struct mem_cgroup *memcg);
|
|
|
|
extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg);
|
2009-01-07 18:07:48 -08:00
|
|
|
|
2008-03-04 14:29:08 -08:00
|
|
|
extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
|
|
|
|
gfp_t gfp_mask);
|
2012-01-12 17:18:15 -08:00
|
|
|
|
|
|
|
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
|
|
|
|
struct lruvec *mem_cgroup_lru_add_list(struct zone *, struct page *,
|
|
|
|
enum lru_list);
|
|
|
|
void mem_cgroup_lru_del_list(struct page *, enum lru_list);
|
|
|
|
struct lruvec *mem_cgroup_lru_move_lists(struct zone *, struct page *,
|
|
|
|
enum lru_list, enum lru_list);
|
2009-12-15 16:47:03 -08:00
|
|
|
|
|
|
|
/* For coalescing uncharge for reducing memcg' overhead*/
|
|
|
|
extern void mem_cgroup_uncharge_start(void);
|
|
|
|
extern void mem_cgroup_uncharge_end(void);
|
|
|
|
|
2008-02-07 00:14:41 -08:00
|
|
|
extern void mem_cgroup_uncharge_page(struct page *page);
|
memcg: remove refcnt from page_cgroup
memcg: performance improvements
Patch Description
1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed)
2/5 ... swapcache handling patch
3/5 ... add helper function for shmem's memory reclaim patch
4/5 ... optimize by likely/unlikely ppatch
5/5 ... remove redundunt check patch (shmem handling is fixed.)
Unix bench result.
== 2.6.26-rc2-mm1 + memory resource controller
Execl Throughput 2915.4 lps (29.6 secs, 3 samples)
C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 2915.4 678.0
File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0
File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5
File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8
Shell Scripts (8 concurrent) 6.0 1097.7 1829.5
=========
FINAL SCORE 991.3
== 2.6.26-rc2-mm1 + this set ==
Execl Throughput 3012.9 lps (29.9 secs, 3 samples)
C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 3012.9 700.7
File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7
File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3
File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0
Shell Scripts (8 concurrent) 6.0 1120.3 1867.2
=========
FINAL SCORE 1004.6
This patch:
Remove refcnt from page_cgroup().
After this,
* A page is charged only when !page_mapped() && no page_cgroup is assigned.
* Anon page is newly mapped.
* File page is added to mapping->tree.
* A page is uncharged only when
* Anon page is fully unmapped.
* File page is removed from LRU.
There is no change in behavior from user's view.
This patch also removes unnecessary calls in rmap.c which was used only for
refcnt mangement.
[akpm@linux-foundation.org: fix warning]
[hugh@veritas.com: fix shmem_unuse_inode charging]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-25 01:47:14 -07:00
|
|
|
extern void mem_cgroup_uncharge_cache_page(struct page *page);
|
2008-07-25 01:47:15 -07:00
|
|
|
|
2012-03-21 16:34:10 -07:00
|
|
|
extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
|
|
|
int order);
|
2012-05-29 15:06:25 -07:00
|
|
|
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
|
|
|
|
struct mem_cgroup *memcg);
|
2011-11-02 13:38:15 -07:00
|
|
|
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
|
2008-02-07 00:14:03 -08:00
|
|
|
|
2009-12-16 12:19:59 +01:00
|
|
|
extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
|
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 01:00:16 -07:00
|
|
|
extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
|
2011-06-15 15:08:13 -07:00
|
|
|
extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
|
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 01:00:16 -07:00
|
|
|
|
2011-12-11 21:47:03 +00:00
|
|
|
extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
|
2011-12-11 21:47:04 +00:00
|
|
|
extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont);
|
2011-12-11 21:47:03 +00:00
|
|
|
|
2009-01-07 18:08:07 -08:00
|
|
|
static inline
|
|
|
|
int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
|
|
|
|
{
|
2011-11-02 13:38:15 -07:00
|
|
|
struct mem_cgroup *memcg;
|
2012-05-29 15:06:25 -07:00
|
|
|
int match;
|
|
|
|
|
2009-01-07 18:08:07 -08:00
|
|
|
rcu_read_lock();
|
2011-11-02 13:38:15 -07:00
|
|
|
memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner));
|
2012-05-29 15:06:25 -07:00
|
|
|
match = __mem_cgroup_same_or_subtree(cgroup, memcg);
|
2009-01-07 18:08:07 -08:00
|
|
|
rcu_read_unlock();
|
2012-05-29 15:06:25 -07:00
|
|
|
return match;
|
2009-01-07 18:08:07 -08:00
|
|
|
}
|
2008-02-07 00:13:53 -08:00
|
|
|
|
2011-11-02 13:38:15 -07:00
|
|
|
extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
|
2009-12-16 12:19:59 +01:00
|
|
|
|
2008-07-25 01:47:10 -07:00
|
|
|
extern int
|
memcg: fix mis-accounting of file mapped racy with migration
FILE_MAPPED per memcg of migrated file cache is not properly updated,
because our hook in page_add_file_rmap() can't know to which memcg
FILE_MAPPED should be counted.
Basically, this patch is for fixing the bug but includes some big changes
to fix up other messes.
Now, at migrating mapped file, events happen in following sequence.
1. allocate a new page.
2. get memcg of an old page.
3. charge ageinst a new page before migration. But at this point,
no changes to new page's page_cgroup, no commit for the charge.
(IOW, PCG_USED bit is not set.)
4. page migration replaces radix-tree, old-page and new-page.
5. page migration remaps the new page if the old page was mapped.
6. Here, the new page is unlocked.
7. memcg commits the charge for newpage, Mark the new page's page_cgroup
as PCG_USED.
Because "commit" happens after page-remap, we can count FILE_MAPPED
at "5", because we should avoid to trust page_cgroup->mem_cgroup.
if PCG_USED bit is unset.
(Note: memcg's LRU removal code does that but LRU-isolation logic is used
for helping it. When we overwrite page_cgroup->mem_cgroup, page_cgroup is
not on LRU or page_cgroup->mem_cgroup is NULL.)
We can lose file_mapped accounting information at 5 because FILE_MAPPED
is updated only when mapcount changes 0->1. So we should catch it.
BTW, historically, above implemntation comes from migration-failure
of anonymous page. Because we charge both of old page and new page
with mapcount=0, we can't catch
- the page is really freed before remap.
- migration fails but it's freed before remap
or .....corner cases.
New migration sequence with memcg is:
1. allocate a new page.
2. mark PageCgroupMigration to the old page.
3. charge against a new page onto the old page's memcg. (here, new page's pc
is marked as PageCgroupUsed.)
4. page migration replaces radix-tree, page table, etc...
5. At remapping, new page's page_cgroup is now makrked as "USED"
We can catch 0->1 event and FILE_MAPPED will be properly updated.
And we can catch SWAPOUT event after unlock this and freeing this
page by unmap() can be caught.
7. Clear PageCgroupMigration of the old page.
So, FILE_MAPPED will be correctly updated.
Then, for what MIGRATION flag is ?
Without it, at migration failure, we may have to charge old page again
because it may be fully unmapped. "charge" means that we have to dive into
memory reclaim or something complated. So, it's better to avoid
charge it again. Before this patch, __commit_charge() was working for
both of the old/new page and fixed up all. But this technique has some
racy condtion around FILE_MAPPED and SWAPOUT etc...
Now, the kernel use MIGRATION flag and don't uncharge old page until
the end of migration.
I hope this change will make memcg's page migration much simpler. This
page migration has caused several troubles. Worth to add a flag for
simplification.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-26 14:42:46 -07:00
|
|
|
mem_cgroup_prepare_migration(struct page *page,
|
2012-01-12 17:18:32 -08:00
|
|
|
struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask);
|
2011-11-02 13:38:15 -07:00
|
|
|
extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
|
2011-01-13 15:47:43 -08:00
|
|
|
struct page *oldpage, struct page *newpage, bool migration_ok);
|
2008-02-07 00:14:10 -08:00
|
|
|
|
2012-01-12 17:17:59 -08:00
|
|
|
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
|
|
|
|
struct mem_cgroup *,
|
|
|
|
struct mem_cgroup_reclaim_cookie *);
|
|
|
|
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
|
|
|
|
|
2008-02-07 00:14:32 -08:00
|
|
|
/*
|
|
|
|
* For memory reclaim.
|
|
|
|
*/
|
2011-11-02 13:38:23 -07:00
|
|
|
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg,
|
|
|
|
struct zone *zone);
|
|
|
|
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg,
|
|
|
|
struct zone *zone);
|
2011-05-26 16:25:33 -07:00
|
|
|
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
|
2012-05-29 15:07:00 -07:00
|
|
|
unsigned long mem_cgroup_get_lruvec_size(struct lruvec *lruvec, enum lru_list);
|
2009-01-07 18:08:20 -08:00
|
|
|
struct zone_reclaim_stat*
|
|
|
|
mem_cgroup_get_reclaim_stat_from_page(struct page *page);
|
2009-04-02 16:57:39 -07:00
|
|
|
extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
|
|
|
|
struct task_struct *p);
|
memcg: add mem_cgroup_replace_page_cache() to fix LRU issue
Commit ef6a3c6311 ("mm: add replace_page_cache_page() function") added a
function replace_page_cache_page(). This function replaces a page in the
radix-tree with a new page. WHen doing this, memory cgroup needs to fix
up the accounting information. memcg need to check PCG_USED bit etc.
In some(many?) cases, 'newpage' is on LRU before calling
replace_page_cache(). So, memcg's LRU accounting information should be
fixed, too.
This patch adds mem_cgroup_replace_page_cache() and removes the old hooks.
In that function, old pages will be unaccounted without touching
res_counter and new page will be accounted to the memcg (of old page).
WHen overwriting pc->mem_cgroup of newpage, take zone->lru_lock and avoid
races with LRU handling.
Background:
replace_page_cache_page() is called by FUSE code in its splice() handling.
Here, 'newpage' is replacing oldpage but this newpage is not a newly allocated
page and may be on LRU. LRU mis-accounting will be critical for memory cgroup
because rmdir() checks the whole LRU is empty and there is no account leak.
If a page is on the other LRU than it should be, rmdir() will fail.
This bug was added in March 2011, but no bug report yet. I guess there
are not many people who use memcg and FUSE at the same time with upstream
kernels.
The result of this bug is that admin cannot destroy a memcg because of
account leak. So, no panic, no deadlock. And, even if an active cgroup
exist, umount can succseed. So no problem at shutdown.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-12 17:17:44 -08:00
|
|
|
extern void mem_cgroup_replace_page_cache(struct page *oldpage,
|
|
|
|
struct page *newpage);
|
2008-02-07 00:14:32 -08:00
|
|
|
|
2009-01-07 18:07:57 -08:00
|
|
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
|
|
|
extern int do_swap_account;
|
|
|
|
#endif
|
2009-01-07 18:08:02 -08:00
|
|
|
|
|
|
|
static inline bool mem_cgroup_disabled(void)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_subsys.disabled)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:25 -07:00
|
|
|
void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
|
|
|
|
unsigned long *flags);
|
|
|
|
|
2012-03-21 16:34:26 -07:00
|
|
|
extern atomic_t memcg_moving;
|
|
|
|
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:25 -07:00
|
|
|
static inline void mem_cgroup_begin_update_page_stat(struct page *page,
|
|
|
|
bool *locked, unsigned long *flags)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
rcu_read_lock();
|
|
|
|
*locked = false;
|
2012-03-21 16:34:26 -07:00
|
|
|
if (atomic_read(&memcg_moving))
|
|
|
|
__mem_cgroup_begin_update_page_stat(page, locked, flags);
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:25 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
void __mem_cgroup_end_update_page_stat(struct page *page,
|
|
|
|
unsigned long *flags);
|
|
|
|
static inline void mem_cgroup_end_update_page_stat(struct page *page,
|
|
|
|
bool *locked, unsigned long *flags)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
if (*locked)
|
|
|
|
__mem_cgroup_end_update_page_stat(page, flags);
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2011-01-13 15:47:37 -08:00
|
|
|
void mem_cgroup_update_page_stat(struct page *page,
|
|
|
|
enum mem_cgroup_page_stat_item idx,
|
|
|
|
int val);
|
|
|
|
|
|
|
|
static inline void mem_cgroup_inc_page_stat(struct page *page,
|
|
|
|
enum mem_cgroup_page_stat_item idx)
|
|
|
|
{
|
|
|
|
mem_cgroup_update_page_stat(page, idx, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_dec_page_stat(struct page *page,
|
|
|
|
enum mem_cgroup_page_stat_item idx)
|
|
|
|
{
|
|
|
|
mem_cgroup_update_page_stat(page, idx, -1);
|
|
|
|
}
|
|
|
|
|
2009-09-23 15:56:39 -07:00
|
|
|
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
2011-05-26 16:25:25 -07:00
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned);
|
2011-11-02 13:38:15 -07:00
|
|
|
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg);
|
oom: badness heuristic rewrite
This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions. The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.
Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead. This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits. This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.
The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory. "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit. The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.
The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.
Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs. In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.
Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it. It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability. Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered. The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.
/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa. Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning. Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity. This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-09 17:19:46 -07:00
|
|
|
|
2011-05-26 16:25:38 -07:00
|
|
|
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
|
2011-01-20 14:44:24 -08:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
2012-01-12 17:18:20 -08:00
|
|
|
void mem_cgroup_split_huge_fixup(struct page *head);
|
2011-01-20 14:44:24 -08:00
|
|
|
#endif
|
|
|
|
|
2011-03-23 16:42:25 -07:00
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
bool mem_cgroup_bad_page_check(struct page *page);
|
|
|
|
void mem_cgroup_print_bad_page(struct page *page);
|
|
|
|
#endif
|
2008-10-18 20:28:16 -07:00
|
|
|
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
|
2009-01-07 18:07:48 -08:00
|
|
|
struct mem_cgroup;
|
|
|
|
|
|
|
|
static inline int mem_cgroup_newpage_charge(struct page *page,
|
2008-03-04 14:29:08 -08:00
|
|
|
struct mm_struct *mm, gfp_t gfp_mask)
|
2008-02-07 00:13:53 -08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-03-04 14:29:08 -08:00
|
|
|
static inline int mem_cgroup_cache_charge(struct page *page,
|
|
|
|
struct mm_struct *mm, gfp_t gfp_mask)
|
2008-02-07 00:13:53 -08:00
|
|
|
{
|
2008-03-04 14:29:08 -08:00
|
|
|
return 0;
|
2008-02-07 00:13:53 -08:00
|
|
|
}
|
|
|
|
|
2009-01-07 18:08:00 -08:00
|
|
|
static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
|
2012-01-12 17:18:32 -08:00
|
|
|
struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp)
|
2009-01-07 18:07:48 -08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_commit_charge_swapin(struct page *page,
|
2012-01-12 17:18:32 -08:00
|
|
|
struct mem_cgroup *memcg)
|
2009-01-07 18:07:48 -08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2012-01-12 17:18:32 -08:00
|
|
|
static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
|
2009-01-07 18:07:48 -08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-12-15 16:47:03 -08:00
|
|
|
static inline void mem_cgroup_uncharge_start(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_uncharge_end(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2008-02-07 00:13:53 -08:00
|
|
|
static inline void mem_cgroup_uncharge_page(struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
memcg: remove refcnt from page_cgroup
memcg: performance improvements
Patch Description
1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed)
2/5 ... swapcache handling patch
3/5 ... add helper function for shmem's memory reclaim patch
4/5 ... optimize by likely/unlikely ppatch
5/5 ... remove redundunt check patch (shmem handling is fixed.)
Unix bench result.
== 2.6.26-rc2-mm1 + memory resource controller
Execl Throughput 2915.4 lps (29.6 secs, 3 samples)
C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 2915.4 678.0
File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0
File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5
File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8
Shell Scripts (8 concurrent) 6.0 1097.7 1829.5
=========
FINAL SCORE 991.3
== 2.6.26-rc2-mm1 + this set ==
Execl Throughput 3012.9 lps (29.9 secs, 3 samples)
C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 3012.9 700.7
File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7
File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3
File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0
Shell Scripts (8 concurrent) 6.0 1120.3 1867.2
=========
FINAL SCORE 1004.6
This patch:
Remove refcnt from page_cgroup().
After this,
* A page is charged only when !page_mapped() && no page_cgroup is assigned.
* Anon page is newly mapped.
* File page is added to mapping->tree.
* A page is uncharged only when
* Anon page is fully unmapped.
* File page is removed from LRU.
There is no change in behavior from user's view.
This patch also removes unnecessary calls in rmap.c which was used only for
refcnt mangement.
[akpm@linux-foundation.org: fix warning]
[hugh@veritas.com: fix shmem_unuse_inode charging]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-25 01:47:14 -07:00
|
|
|
static inline void mem_cgroup_uncharge_cache_page(struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2012-01-12 17:18:15 -08:00
|
|
|
static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
|
|
|
|
struct mem_cgroup *memcg)
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 18:08:01 -08:00
|
|
|
{
|
2012-01-12 17:18:15 -08:00
|
|
|
return &zone->lruvec;
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 18:08:01 -08:00
|
|
|
}
|
|
|
|
|
2012-01-12 17:18:15 -08:00
|
|
|
static inline struct lruvec *mem_cgroup_lru_add_list(struct zone *zone,
|
|
|
|
struct page *page,
|
|
|
|
enum lru_list lru)
|
2011-03-22 16:32:53 -07:00
|
|
|
{
|
2012-01-12 17:18:15 -08:00
|
|
|
return &zone->lruvec;
|
2011-03-22 16:32:53 -07:00
|
|
|
}
|
|
|
|
|
2012-01-12 17:18:15 -08:00
|
|
|
static inline void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 18:08:01 -08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2012-01-12 17:18:15 -08:00
|
|
|
static inline struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
|
|
|
|
struct page *page,
|
|
|
|
enum lru_list from,
|
|
|
|
enum lru_list to)
|
2008-02-07 00:13:56 -08:00
|
|
|
{
|
2012-01-12 17:18:15 -08:00
|
|
|
return &zone->lruvec;
|
2008-02-07 00:13:56 -08:00
|
|
|
}
|
|
|
|
|
2009-12-16 12:19:59 +01:00
|
|
|
static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2011-06-15 15:08:13 -07:00
|
|
|
static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2011-11-02 13:38:15 -07:00
|
|
|
static inline int mm_match_cgroup(struct mm_struct *mm,
|
|
|
|
struct mem_cgroup *memcg)
|
2008-02-07 00:14:01 -08:00
|
|
|
{
|
2008-02-09 00:10:15 -08:00
|
|
|
return 1;
|
2008-02-07 00:14:01 -08:00
|
|
|
}
|
|
|
|
|
2008-02-07 00:14:06 -08:00
|
|
|
static inline int task_in_mem_cgroup(struct task_struct *task,
|
2011-11-02 13:38:15 -07:00
|
|
|
const struct mem_cgroup *memcg)
|
2008-02-07 00:14:06 -08:00
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2011-11-02 13:38:15 -07:00
|
|
|
static inline struct cgroup_subsys_state
|
|
|
|
*mem_cgroup_css(struct mem_cgroup *memcg)
|
2009-12-16 12:19:59 +01:00
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2008-07-25 01:47:10 -07:00
|
|
|
static inline int
|
memcg: fix mis-accounting of file mapped racy with migration
FILE_MAPPED per memcg of migrated file cache is not properly updated,
because our hook in page_add_file_rmap() can't know to which memcg
FILE_MAPPED should be counted.
Basically, this patch is for fixing the bug but includes some big changes
to fix up other messes.
Now, at migrating mapped file, events happen in following sequence.
1. allocate a new page.
2. get memcg of an old page.
3. charge ageinst a new page before migration. But at this point,
no changes to new page's page_cgroup, no commit for the charge.
(IOW, PCG_USED bit is not set.)
4. page migration replaces radix-tree, old-page and new-page.
5. page migration remaps the new page if the old page was mapped.
6. Here, the new page is unlocked.
7. memcg commits the charge for newpage, Mark the new page's page_cgroup
as PCG_USED.
Because "commit" happens after page-remap, we can count FILE_MAPPED
at "5", because we should avoid to trust page_cgroup->mem_cgroup.
if PCG_USED bit is unset.
(Note: memcg's LRU removal code does that but LRU-isolation logic is used
for helping it. When we overwrite page_cgroup->mem_cgroup, page_cgroup is
not on LRU or page_cgroup->mem_cgroup is NULL.)
We can lose file_mapped accounting information at 5 because FILE_MAPPED
is updated only when mapcount changes 0->1. So we should catch it.
BTW, historically, above implemntation comes from migration-failure
of anonymous page. Because we charge both of old page and new page
with mapcount=0, we can't catch
- the page is really freed before remap.
- migration fails but it's freed before remap
or .....corner cases.
New migration sequence with memcg is:
1. allocate a new page.
2. mark PageCgroupMigration to the old page.
3. charge against a new page onto the old page's memcg. (here, new page's pc
is marked as PageCgroupUsed.)
4. page migration replaces radix-tree, page table, etc...
5. At remapping, new page's page_cgroup is now makrked as "USED"
We can catch 0->1 event and FILE_MAPPED will be properly updated.
And we can catch SWAPOUT event after unlock this and freeing this
page by unmap() can be caught.
7. Clear PageCgroupMigration of the old page.
So, FILE_MAPPED will be correctly updated.
Then, for what MIGRATION flag is ?
Without it, at migration failure, we may have to charge old page again
because it may be fully unmapped. "charge" means that we have to dive into
memory reclaim or something complated. So, it's better to avoid
charge it again. Before this patch, __commit_charge() was working for
both of the old/new page and fixed up all. But this technique has some
racy condtion around FILE_MAPPED and SWAPOUT etc...
Now, the kernel use MIGRATION flag and don't uncharge old page until
the end of migration.
I hope this change will make memcg's page migration much simpler. This
page migration has caused several troubles. Worth to add a flag for
simplification.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-26 14:42:46 -07:00
|
|
|
mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
|
2012-01-12 17:18:32 -08:00
|
|
|
struct mem_cgroup **memcgp, gfp_t gfp_mask)
|
2008-02-07 00:14:10 -08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-11-02 13:38:15 -07:00
|
|
|
static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
|
2011-01-13 15:47:43 -08:00
|
|
|
struct page *oldpage, struct page *newpage, bool migration_ok)
|
2008-02-07 00:14:10 -08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2012-01-12 17:17:59 -08:00
|
|
|
static inline struct mem_cgroup *
|
|
|
|
mem_cgroup_iter(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *prev,
|
|
|
|
struct mem_cgroup_reclaim_cookie *reclaim)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *prev)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-01-07 18:08:02 -08:00
|
|
|
static inline bool mem_cgroup_disabled(void)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
2009-01-07 18:08:08 -08:00
|
|
|
|
2009-01-07 18:08:18 -08:00
|
|
|
static inline int
|
2011-11-02 13:38:23 -07:00
|
|
|
mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
|
2009-01-07 18:08:18 -08:00
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2009-06-16 15:32:28 -07:00
|
|
|
static inline int
|
2011-11-02 13:38:23 -07:00
|
|
|
mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
|
2009-06-16 15:32:28 -07:00
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2009-01-07 18:08:19 -08:00
|
|
|
static inline unsigned long
|
2012-05-29 15:07:00 -07:00
|
|
|
mem_cgroup_get_lruvec_size(struct lruvec *lruvec, enum lru_list lru)
|
2009-01-07 18:08:19 -08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-01-07 18:08:20 -08:00
|
|
|
static inline struct zone_reclaim_stat*
|
|
|
|
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2009-04-02 16:57:39 -07:00
|
|
|
static inline void
|
|
|
|
mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:25 -07:00
|
|
|
static inline void mem_cgroup_begin_update_page_stat(struct page *page,
|
|
|
|
bool *locked, unsigned long *flags)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_end_update_page_stat(struct page *page,
|
|
|
|
bool *locked, unsigned long *flags)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2011-01-13 15:47:37 -08:00
|
|
|
static inline void mem_cgroup_inc_page_stat(struct page *page,
|
|
|
|
enum mem_cgroup_page_stat_item idx)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_dec_page_stat(struct page *page,
|
|
|
|
enum mem_cgroup_page_stat_item idx)
|
2009-06-17 16:26:34 -07:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-09-23 15:56:39 -07:00
|
|
|
static inline
|
|
|
|
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
2011-05-26 16:25:25 -07:00
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned)
|
2009-09-23 15:56:39 -07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
oom: badness heuristic rewrite
This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions. The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.
Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead. This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits. This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.
The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory. "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit. The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.
The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.
Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs. In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.
Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it. It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability. Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered. The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.
/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa. Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning. Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity. This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-09 17:19:46 -07:00
|
|
|
static inline
|
2011-11-02 13:38:15 -07:00
|
|
|
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
oom: badness heuristic rewrite
This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions. The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.
Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead. This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits. This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.
The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory. "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit. The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.
The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.
Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs. In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.
Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it. It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability. Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered. The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.
/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa. Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning. Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity. This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-09 17:19:46 -07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-01-12 17:18:20 -08:00
|
|
|
static inline void mem_cgroup_split_huge_fixup(struct page *head)
|
2011-01-20 14:44:24 -08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2011-05-26 16:25:38 -07:00
|
|
|
static inline
|
|
|
|
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
|
|
|
|
{
|
|
|
|
}
|
memcg: add mem_cgroup_replace_page_cache() to fix LRU issue
Commit ef6a3c6311 ("mm: add replace_page_cache_page() function") added a
function replace_page_cache_page(). This function replaces a page in the
radix-tree with a new page. WHen doing this, memory cgroup needs to fix
up the accounting information. memcg need to check PCG_USED bit etc.
In some(many?) cases, 'newpage' is on LRU before calling
replace_page_cache(). So, memcg's LRU accounting information should be
fixed, too.
This patch adds mem_cgroup_replace_page_cache() and removes the old hooks.
In that function, old pages will be unaccounted without touching
res_counter and new page will be accounted to the memcg (of old page).
WHen overwriting pc->mem_cgroup of newpage, take zone->lru_lock and avoid
races with LRU handling.
Background:
replace_page_cache_page() is called by FUSE code in its splice() handling.
Here, 'newpage' is replacing oldpage but this newpage is not a newly allocated
page and may be on LRU. LRU mis-accounting will be critical for memory cgroup
because rmdir() checks the whole LRU is empty and there is no account leak.
If a page is on the other LRU than it should be, rmdir() will fail.
This bug was added in March 2011, but no bug report yet. I guess there
are not many people who use memcg and FUSE at the same time with upstream
kernels.
The result of this bug is that admin cannot destroy a memcg because of
account leak. So, no panic, no deadlock. And, even if an active cgroup
exist, umount can succseed. So no problem at shutdown.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-12 17:17:44 -08:00
|
|
|
static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
|
|
|
|
struct page *newpage)
|
|
|
|
{
|
|
|
|
}
|
2012-03-21 16:34:18 -07:00
|
|
|
#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
|
2008-02-07 00:13:51 -08:00
|
|
|
|
2011-03-23 16:42:25 -07:00
|
|
|
#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
|
|
|
|
static inline bool
|
|
|
|
mem_cgroup_bad_page_check(struct page *page)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
mem_cgroup_print_bad_page(struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2011-12-11 21:47:03 +00:00
|
|
|
enum {
|
|
|
|
UNDER_LIMIT,
|
|
|
|
SOFT_LIMIT,
|
|
|
|
OVER_LIMIT,
|
|
|
|
};
|
|
|
|
|
|
|
|
struct sock;
|
|
|
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
|
|
|
|
void sock_update_memcg(struct sock *sk);
|
|
|
|
void sock_release_memcg(struct sock *sk);
|
|
|
|
#else
|
|
|
|
static inline void sock_update_memcg(struct sock *sk)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void sock_release_memcg(struct sock *sk)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
|
2008-02-07 00:13:50 -08:00
|
|
|
#endif /* _LINUX_MEMCONTROL_H */
|
|
|
|
|