mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-09 15:29:16 +00:00
Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
 "12 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, vmscan: prevent kswapd livelock due to pfmemalloc-throttled process being killed
  memcg: fix destination cgroup leak on task charges migration
  mm: memcontrol: switch soft limit default back to infinity
  mm/debug_pagealloc: remove obsolete Kconfig options
  vfs: renumber FMODE_NONOTIFY and add to uniqueness check
  arch/blackfin/mach-bf533/boards/stamp.c: add linux/delay.h
  ocfs2: fix the wrong directory passed to ocfs2_lookup_ino_from_name() when link file
  MAINTAINERS: update rydberg's addresses
  mm: protect set_page_dirty() from ongoing truncation
  mm: prevent endless growth of anon_vma hierarchy
  exit: fix race between wait_consider_task() and wait_task_zombie()
  ocfs2: remove bogus check in dlm_process_recovery_data
This commit is contained in:
commit
b3d574aec7
1
.mailmap
1
.mailmap
@ -51,6 +51,7 @@ Greg Kroah-Hartman <gregkh@suse.de>
|
||||
Greg Kroah-Hartman <greg@kroah.com>
|
||||
Henk Vergonet <Henk.Vergonet@gmail.com>
|
||||
Henrik Kretzschmar <henne@nachtwindheim.de>
|
||||
Henrik Rydberg <rydberg@bitmath.org>
|
||||
Herbert Xu <herbert@gondor.apana.org.au>
|
||||
Jacob Shin <Jacob.Shin@amd.com>
|
||||
James Bottomley <jejb@mulgrave.(none)>
|
||||
|
12
MAINTAINERS
12
MAINTAINERS
@ -724,15 +724,15 @@ F: include/uapi/linux/apm_bios.h
|
||||
F: drivers/char/apm-emulation.c
|
||||
|
||||
APPLE BCM5974 MULTITOUCH DRIVER
|
||||
M: Henrik Rydberg <rydberg@euromail.se>
|
||||
M: Henrik Rydberg <rydberg@bitmath.org>
|
||||
L: linux-input@vger.kernel.org
|
||||
S: Maintained
|
||||
S: Odd fixes
|
||||
F: drivers/input/mouse/bcm5974.c
|
||||
|
||||
APPLE SMC DRIVER
|
||||
M: Henrik Rydberg <rydberg@euromail.se>
|
||||
M: Henrik Rydberg <rydberg@bitmath.org>
|
||||
L: lm-sensors@lm-sensors.org
|
||||
S: Maintained
|
||||
S: Odd fixes
|
||||
F: drivers/hwmon/applesmc.c
|
||||
|
||||
APPLETALK NETWORK LAYER
|
||||
@ -4940,10 +4940,10 @@ F: include/uapi/linux/input.h
|
||||
F: include/linux/input/
|
||||
|
||||
INPUT MULTITOUCH (MT) PROTOCOL
|
||||
M: Henrik Rydberg <rydberg@euromail.se>
|
||||
M: Henrik Rydberg <rydberg@bitmath.org>
|
||||
L: linux-input@vger.kernel.org
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/rydberg/input-mt.git
|
||||
S: Maintained
|
||||
S: Odd fixes
|
||||
F: Documentation/input/multi-touch-protocol.txt
|
||||
F: drivers/input/input-mt.c
|
||||
K: \b(ABS|SYN)_MT_
|
||||
|
@ -7,6 +7,7 @@
|
||||
*/
|
||||
|
||||
#include <linux/device.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/platform_device.h>
|
||||
#include <linux/mtd/mtd.h>
|
||||
#include <linux/mtd/partitions.h>
|
||||
|
@ -740,14 +740,15 @@ static int __init fcntl_init(void)
|
||||
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
|
||||
* is defined as O_NONBLOCK on some platforms and not on others.
|
||||
*/
|
||||
BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
|
||||
BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
|
||||
O_RDONLY | O_WRONLY | O_RDWR |
|
||||
O_CREAT | O_EXCL | O_NOCTTY |
|
||||
O_TRUNC | O_APPEND | /* O_NONBLOCK | */
|
||||
__O_SYNC | O_DSYNC | FASYNC |
|
||||
O_DIRECT | O_LARGEFILE | O_DIRECTORY |
|
||||
O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
|
||||
__FMODE_EXEC | O_PATH | __O_TMPFILE
|
||||
__FMODE_EXEC | O_PATH | __O_TMPFILE |
|
||||
__FMODE_NONOTIFY
|
||||
));
|
||||
|
||||
fasync_cache = kmem_cache_create("fasync_cache",
|
||||
|
@ -2023,11 +2023,8 @@ leave:
|
||||
dlm_lockres_drop_inflight_ref(dlm, res);
|
||||
spin_unlock(&res->spinlock);
|
||||
|
||||
if (ret < 0) {
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
if (newlock)
|
||||
dlm_lock_put(newlock);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -94,6 +94,14 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
|
||||
struct inode *inode,
|
||||
const char *symname);
|
||||
|
||||
static int ocfs2_double_lock(struct ocfs2_super *osb,
|
||||
struct buffer_head **bh1,
|
||||
struct inode *inode1,
|
||||
struct buffer_head **bh2,
|
||||
struct inode *inode2,
|
||||
int rename);
|
||||
|
||||
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
|
||||
/* An orphan dir name is an 8 byte value, printed as a hex string */
|
||||
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
|
||||
|
||||
@ -678,8 +686,10 @@ static int ocfs2_link(struct dentry *old_dentry,
|
||||
{
|
||||
handle_t *handle;
|
||||
struct inode *inode = old_dentry->d_inode;
|
||||
struct inode *old_dir = old_dentry->d_parent->d_inode;
|
||||
int err;
|
||||
struct buffer_head *fe_bh = NULL;
|
||||
struct buffer_head *old_dir_bh = NULL;
|
||||
struct buffer_head *parent_fe_bh = NULL;
|
||||
struct ocfs2_dinode *fe = NULL;
|
||||
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
|
||||
@ -696,19 +706,33 @@ static int ocfs2_link(struct dentry *old_dentry,
|
||||
|
||||
dquot_initialize(dir);
|
||||
|
||||
err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
|
||||
err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
|
||||
&parent_fe_bh, dir, 0);
|
||||
if (err < 0) {
|
||||
if (err != -ENOENT)
|
||||
mlog_errno(err);
|
||||
return err;
|
||||
}
|
||||
|
||||
/* make sure both dirs have bhs
|
||||
* get an extra ref on old_dir_bh if old==new */
|
||||
if (!parent_fe_bh) {
|
||||
if (old_dir_bh) {
|
||||
parent_fe_bh = old_dir_bh;
|
||||
get_bh(parent_fe_bh);
|
||||
} else {
|
||||
mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
|
||||
err = -EIO;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (!dir->i_nlink) {
|
||||
err = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
|
||||
err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
|
||||
old_dentry->d_name.len, &old_de_ino);
|
||||
if (err) {
|
||||
err = -ENOENT;
|
||||
@ -801,10 +825,11 @@ out_unlock_inode:
|
||||
ocfs2_inode_unlock(inode, 1);
|
||||
|
||||
out:
|
||||
ocfs2_inode_unlock(dir, 1);
|
||||
ocfs2_double_unlock(old_dir, dir);
|
||||
|
||||
brelse(fe_bh);
|
||||
brelse(parent_fe_bh);
|
||||
brelse(old_dir_bh);
|
||||
|
||||
ocfs2_free_dir_lookup_result(&lookup);
|
||||
|
||||
@ -1072,14 +1097,15 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
|
||||
}
|
||||
|
||||
/*
|
||||
* The only place this should be used is rename!
|
||||
* The only place this should be used is rename and link!
|
||||
* if they have the same id, then the 1st one is the only one locked.
|
||||
*/
|
||||
static int ocfs2_double_lock(struct ocfs2_super *osb,
|
||||
struct buffer_head **bh1,
|
||||
struct inode *inode1,
|
||||
struct buffer_head **bh2,
|
||||
struct inode *inode2)
|
||||
struct inode *inode2,
|
||||
int rename)
|
||||
{
|
||||
int status;
|
||||
int inode1_is_ancestor, inode2_is_ancestor;
|
||||
@ -1127,7 +1153,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
|
||||
}
|
||||
/* lock id2 */
|
||||
status = ocfs2_inode_lock_nested(inode2, bh2, 1,
|
||||
OI_LS_RENAME1);
|
||||
rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
|
||||
if (status < 0) {
|
||||
if (status != -ENOENT)
|
||||
mlog_errno(status);
|
||||
@ -1136,7 +1162,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
|
||||
}
|
||||
|
||||
/* lock id1 */
|
||||
status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
|
||||
status = ocfs2_inode_lock_nested(inode1, bh1, 1,
|
||||
rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT);
|
||||
if (status < 0) {
|
||||
/*
|
||||
* An error return must mean that no cluster locks
|
||||
@ -1252,7 +1279,7 @@ static int ocfs2_rename(struct inode *old_dir,
|
||||
|
||||
/* if old and new are the same, this'll just do one lock. */
|
||||
status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
|
||||
&new_dir_bh, new_dir);
|
||||
&new_dir_bh, new_dir, 1);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
|
@ -135,7 +135,7 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
|
||||
#define FMODE_CAN_WRITE ((__force fmode_t)0x40000)
|
||||
|
||||
/* File was opened by fanotify and shouldn't generate fanotify events */
|
||||
#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
|
||||
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
|
||||
|
||||
/*
|
||||
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
|
||||
|
@ -36,6 +36,16 @@ struct anon_vma {
|
||||
*/
|
||||
atomic_t refcount;
|
||||
|
||||
/*
|
||||
* Count of child anon_vmas and VMAs which point to this anon_vma.
|
||||
*
|
||||
* This counter is used for making decision about reusing anon_vma
|
||||
* instead of forking new one. See comments in function anon_vma_clone.
|
||||
*/
|
||||
unsigned degree;
|
||||
|
||||
struct anon_vma *parent; /* Parent of this anon_vma */
|
||||
|
||||
/*
|
||||
* NOTE: the LSB of the rb_root.rb_node is set by
|
||||
* mm_take_all_locks() _after_ taking the above lock. So the
|
||||
|
@ -177,7 +177,6 @@ int write_cache_pages(struct address_space *mapping,
|
||||
struct writeback_control *wbc, writepage_t writepage,
|
||||
void *data);
|
||||
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
|
||||
void set_page_dirty_balance(struct page *page);
|
||||
void writeback_set_ratelimit(void);
|
||||
void tag_pages_for_writeback(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end);
|
||||
|
@ -5,7 +5,7 @@
|
||||
|
||||
/*
|
||||
* FMODE_EXEC is 0x20
|
||||
* FMODE_NONOTIFY is 0x1000000
|
||||
* FMODE_NONOTIFY is 0x4000000
|
||||
* These cannot be used by userspace O_* until internal and external open
|
||||
* flags are split.
|
||||
* -Eric Paris
|
||||
|
@ -1287,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
|
||||
static int wait_consider_task(struct wait_opts *wo, int ptrace,
|
||||
struct task_struct *p)
|
||||
{
|
||||
/*
|
||||
* We can race with wait_task_zombie() from another thread.
|
||||
* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
|
||||
* can't confuse the checks below.
|
||||
*/
|
||||
int exit_state = ACCESS_ONCE(p->exit_state);
|
||||
int ret;
|
||||
|
||||
if (unlikely(p->exit_state == EXIT_DEAD))
|
||||
if (unlikely(exit_state == EXIT_DEAD))
|
||||
return 0;
|
||||
|
||||
ret = eligible_child(wo, p);
|
||||
@ -1310,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (unlikely(p->exit_state == EXIT_TRACE)) {
|
||||
if (unlikely(exit_state == EXIT_TRACE)) {
|
||||
/*
|
||||
* ptrace == 0 means we are the natural parent. In this case
|
||||
* we should clear notask_error, debugger will notify us.
|
||||
@ -1337,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
|
||||
}
|
||||
|
||||
/* slay zombie? */
|
||||
if (p->exit_state == EXIT_ZOMBIE) {
|
||||
if (exit_state == EXIT_ZOMBIE) {
|
||||
/* we don't reap group leaders with subthreads */
|
||||
if (!delay_group_leader(p)) {
|
||||
/*
|
||||
|
@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC
|
||||
depends on !KMEMCHECK
|
||||
select PAGE_EXTENSION
|
||||
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
|
||||
select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
|
||||
---help---
|
||||
Unmap pages from the kernel linear mapping after free_pages().
|
||||
This results in a large slowdown, but helps to find certain types
|
||||
@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC
|
||||
that would result in incorrect warnings of memory corruption after
|
||||
a resume because free pages are not saved to the suspend image.
|
||||
|
||||
config WANT_PAGE_DEBUG_FLAGS
|
||||
bool
|
||||
|
||||
config PAGE_POISONING
|
||||
bool
|
||||
select WANT_PAGE_DEBUG_FLAGS
|
||||
|
||||
config PAGE_GUARD
|
||||
bool
|
||||
select WANT_PAGE_DEBUG_FLAGS
|
||||
|
@ -3043,18 +3043,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
|
||||
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
|
||||
mem_cgroup_swap_statistics(from, false);
|
||||
mem_cgroup_swap_statistics(to, true);
|
||||
/*
|
||||
* This function is only called from task migration context now.
|
||||
* It postpones page_counter and refcount handling till the end
|
||||
* of task migration(mem_cgroup_clear_mc()) for performance
|
||||
* improvement. But we cannot postpone css_get(to) because if
|
||||
* the process that has been moved to @to does swap-in, the
|
||||
* refcount of @to might be decreased to 0.
|
||||
*
|
||||
* We are in attach() phase, so the cgroup is guaranteed to be
|
||||
* alive, so we can just call css_get().
|
||||
*/
|
||||
css_get(&to->css);
|
||||
return 0;
|
||||
}
|
||||
return -EINVAL;
|
||||
@ -4679,6 +4667,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
if (parent_css == NULL) {
|
||||
root_mem_cgroup = memcg;
|
||||
page_counter_init(&memcg->memory, NULL);
|
||||
memcg->soft_limit = PAGE_COUNTER_MAX;
|
||||
page_counter_init(&memcg->memsw, NULL);
|
||||
page_counter_init(&memcg->kmem, NULL);
|
||||
}
|
||||
@ -4724,6 +4713,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
|
||||
|
||||
if (parent->use_hierarchy) {
|
||||
page_counter_init(&memcg->memory, &parent->memory);
|
||||
memcg->soft_limit = PAGE_COUNTER_MAX;
|
||||
page_counter_init(&memcg->memsw, &parent->memsw);
|
||||
page_counter_init(&memcg->kmem, &parent->kmem);
|
||||
|
||||
@ -4733,6 +4723,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
|
||||
*/
|
||||
} else {
|
||||
page_counter_init(&memcg->memory, NULL);
|
||||
memcg->soft_limit = PAGE_COUNTER_MAX;
|
||||
page_counter_init(&memcg->memsw, NULL);
|
||||
page_counter_init(&memcg->kmem, NULL);
|
||||
/*
|
||||
@ -4807,7 +4798,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
|
||||
mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
|
||||
mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
|
||||
memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
|
||||
memcg->soft_limit = 0;
|
||||
memcg->soft_limit = PAGE_COUNTER_MAX;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
|
27
mm/memory.c
27
mm/memory.c
@ -2137,17 +2137,24 @@ reuse:
|
||||
if (!dirty_page)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* Yes, Virginia, this is actually required to prevent a race
|
||||
* with clear_page_dirty_for_io() from clearing the page dirty
|
||||
* bit after it clear all dirty ptes, but before a racing
|
||||
* do_wp_page installs a dirty pte.
|
||||
*
|
||||
* do_shared_fault is protected similarly.
|
||||
*/
|
||||
if (!page_mkwrite) {
|
||||
wait_on_page_locked(dirty_page);
|
||||
set_page_dirty_balance(dirty_page);
|
||||
struct address_space *mapping;
|
||||
int dirtied;
|
||||
|
||||
lock_page(dirty_page);
|
||||
dirtied = set_page_dirty(dirty_page);
|
||||
VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
|
||||
mapping = dirty_page->mapping;
|
||||
unlock_page(dirty_page);
|
||||
|
||||
if (dirtied && mapping) {
|
||||
/*
|
||||
* Some device drivers do not set page.mapping
|
||||
* but still dirty their pages
|
||||
*/
|
||||
balance_dirty_pages_ratelimited(mapping);
|
||||
}
|
||||
|
||||
/* file_update_time outside page_lock */
|
||||
if (vma->vm_file)
|
||||
file_update_time(vma->vm_file);
|
||||
|
@ -1541,16 +1541,6 @@ pause:
|
||||
bdi_start_background_writeback(bdi);
|
||||
}
|
||||
|
||||
void set_page_dirty_balance(struct page *page)
|
||||
{
|
||||
if (set_page_dirty(page)) {
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
|
||||
if (mapping)
|
||||
balance_dirty_pages_ratelimited(mapping);
|
||||
}
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(int, bdp_ratelimits);
|
||||
|
||||
/*
|
||||
@ -2123,32 +2113,25 @@ EXPORT_SYMBOL(account_page_dirtied);
|
||||
* page dirty in that case, but not all the buffers. This is a "bottom-up"
|
||||
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
|
||||
*
|
||||
* Most callers have locked the page, which pins the address_space in memory.
|
||||
* But zap_pte_range() does not lock the page, however in that case the
|
||||
* mapping is pinned by the vma's ->vm_file reference.
|
||||
*
|
||||
* We take care to handle the case where the page was truncated from the
|
||||
* mapping by re-checking page_mapping() inside tree_lock.
|
||||
* The caller must ensure this doesn't race with truncation. Most will simply
|
||||
* hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
|
||||
* the pte lock held, which also locks out truncation.
|
||||
*/
|
||||
int __set_page_dirty_nobuffers(struct page *page)
|
||||
{
|
||||
if (!TestSetPageDirty(page)) {
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
struct address_space *mapping2;
|
||||
unsigned long flags;
|
||||
|
||||
if (!mapping)
|
||||
return 1;
|
||||
|
||||
spin_lock_irqsave(&mapping->tree_lock, flags);
|
||||
mapping2 = page_mapping(page);
|
||||
if (mapping2) { /* Race with truncate? */
|
||||
BUG_ON(mapping2 != mapping);
|
||||
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
|
||||
account_page_dirtied(page, mapping);
|
||||
radix_tree_tag_set(&mapping->page_tree,
|
||||
page_index(page), PAGECACHE_TAG_DIRTY);
|
||||
}
|
||||
BUG_ON(page_mapping(page) != mapping);
|
||||
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
|
||||
account_page_dirtied(page, mapping);
|
||||
radix_tree_tag_set(&mapping->page_tree, page_index(page),
|
||||
PAGECACHE_TAG_DIRTY);
|
||||
spin_unlock_irqrestore(&mapping->tree_lock, flags);
|
||||
if (mapping->host) {
|
||||
/* !PageAnon && !swapper_space */
|
||||
@ -2305,12 +2288,10 @@ int clear_page_dirty_for_io(struct page *page)
|
||||
/*
|
||||
* We carefully synchronise fault handlers against
|
||||
* installing a dirty pte and marking the page dirty
|
||||
* at this point. We do this by having them hold the
|
||||
* page lock at some point after installing their
|
||||
* pte, but before marking the page dirty.
|
||||
* Pages are always locked coming in here, so we get
|
||||
* the desired exclusion. See mm/memory.c:do_wp_page()
|
||||
* for more comments.
|
||||
* at this point. We do this by having them hold the
|
||||
* page lock while dirtying the page, and pages are
|
||||
* always locked coming in here, so we get the desired
|
||||
* exclusion.
|
||||
*/
|
||||
if (TestClearPageDirty(page)) {
|
||||
dec_zone_page_state(page, NR_FILE_DIRTY);
|
||||
|
42
mm/rmap.c
42
mm/rmap.c
@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
|
||||
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
|
||||
if (anon_vma) {
|
||||
atomic_set(&anon_vma->refcount, 1);
|
||||
anon_vma->degree = 1; /* Reference for first vma */
|
||||
anon_vma->parent = anon_vma;
|
||||
/*
|
||||
* Initialise the anon_vma root to point to itself. If called
|
||||
* from fork, the root will be reset to the parents anon_vma.
|
||||
@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
|
||||
if (likely(!vma->anon_vma)) {
|
||||
vma->anon_vma = anon_vma;
|
||||
anon_vma_chain_link(vma, avc, anon_vma);
|
||||
/* vma reference or self-parent link for new root */
|
||||
anon_vma->degree++;
|
||||
allocated = NULL;
|
||||
avc = NULL;
|
||||
}
|
||||
@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
|
||||
/*
|
||||
* Attach the anon_vmas from src to dst.
|
||||
* Returns 0 on success, -ENOMEM on failure.
|
||||
*
|
||||
* If dst->anon_vma is NULL this function tries to find and reuse existing
|
||||
* anon_vma which has no vmas and only one child anon_vma. This prevents
|
||||
* degradation of anon_vma hierarchy to endless linear chain in case of
|
||||
* constantly forking task. On the other hand, an anon_vma with more than one
|
||||
* child isn't reused even if there was no alive vma, thus rmap walker has a
|
||||
* good chance of avoiding scanning the whole hierarchy when it searches where
|
||||
* page is mapped.
|
||||
*/
|
||||
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
|
||||
{
|
||||
@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
|
||||
anon_vma = pavc->anon_vma;
|
||||
root = lock_anon_vma_root(root, anon_vma);
|
||||
anon_vma_chain_link(dst, avc, anon_vma);
|
||||
|
||||
/*
|
||||
* Reuse existing anon_vma if its degree is lower than two,
|
||||
* that means it has no vma and only one anon_vma child.
|
||||
*
|
||||
* Do not choose parent anon_vma, otherwise first child
|
||||
* will always reuse it. Root anon_vma is never reused:
|
||||
* it has self-parent reference and at least one child.
|
||||
*/
|
||||
if (!dst->anon_vma && anon_vma != src->anon_vma &&
|
||||
anon_vma->degree < 2)
|
||||
dst->anon_vma = anon_vma;
|
||||
}
|
||||
if (dst->anon_vma)
|
||||
dst->anon_vma->degree++;
|
||||
unlock_anon_vma_root(root);
|
||||
return 0;
|
||||
|
||||
@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
|
||||
if (!pvma->anon_vma)
|
||||
return 0;
|
||||
|
||||
/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
|
||||
vma->anon_vma = NULL;
|
||||
|
||||
/*
|
||||
* First, attach the new VMA to the parent VMA's anon_vmas,
|
||||
* so rmap can find non-COWed pages in child processes.
|
||||
@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
/* An existing anon_vma has been reused, all done then. */
|
||||
if (vma->anon_vma)
|
||||
return 0;
|
||||
|
||||
/* Then add our own anon_vma. */
|
||||
anon_vma = anon_vma_alloc();
|
||||
if (!anon_vma)
|
||||
@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
|
||||
* lock any of the anon_vmas in this anon_vma tree.
|
||||
*/
|
||||
anon_vma->root = pvma->anon_vma->root;
|
||||
anon_vma->parent = pvma->anon_vma;
|
||||
/*
|
||||
* With refcounts, an anon_vma can stay around longer than the
|
||||
* process it belongs to. The root anon_vma needs to be pinned until
|
||||
@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
|
||||
vma->anon_vma = anon_vma;
|
||||
anon_vma_lock_write(anon_vma);
|
||||
anon_vma_chain_link(vma, avc, anon_vma);
|
||||
anon_vma->parent->degree++;
|
||||
anon_vma_unlock_write(anon_vma);
|
||||
|
||||
return 0;
|
||||
@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
|
||||
* Leave empty anon_vmas on the list - we'll need
|
||||
* to free them outside the lock.
|
||||
*/
|
||||
if (RB_EMPTY_ROOT(&anon_vma->rb_root))
|
||||
if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
|
||||
anon_vma->parent->degree--;
|
||||
continue;
|
||||
}
|
||||
|
||||
list_del(&avc->same_vma);
|
||||
anon_vma_chain_free(avc);
|
||||
}
|
||||
if (vma->anon_vma)
|
||||
vma->anon_vma->degree--;
|
||||
unlock_anon_vma_root(root);
|
||||
|
||||
/*
|
||||
@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
|
||||
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
|
||||
struct anon_vma *anon_vma = avc->anon_vma;
|
||||
|
||||
BUG_ON(anon_vma->degree);
|
||||
put_anon_vma(anon_vma);
|
||||
|
||||
list_del(&avc->same_vma);
|
||||
|
24
mm/vmscan.c
24
mm/vmscan.c
@ -2921,18 +2921,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
|
||||
return false;
|
||||
|
||||
/*
|
||||
* There is a potential race between when kswapd checks its watermarks
|
||||
* and a process gets throttled. There is also a potential race if
|
||||
* processes get throttled, kswapd wakes, a large process exits therby
|
||||
* balancing the zones that causes kswapd to miss a wakeup. If kswapd
|
||||
* is going to sleep, no process should be sleeping on pfmemalloc_wait
|
||||
* so wake them now if necessary. If necessary, processes will wake
|
||||
* kswapd and get throttled again
|
||||
* The throttled processes are normally woken up in balance_pgdat() as
|
||||
* soon as pfmemalloc_watermark_ok() is true. But there is a potential
|
||||
* race between when kswapd checks the watermarks and a process gets
|
||||
* throttled. There is also a potential race if processes get
|
||||
* throttled, kswapd wakes, a large process exits thereby balancing the
|
||||
* zones, which causes kswapd to exit balance_pgdat() before reaching
|
||||
* the wake up checks. If kswapd is going to sleep, no process should
|
||||
* be sleeping on pfmemalloc_wait, so wake them now if necessary. If
|
||||
* the wake up is premature, processes will wake kswapd and get
|
||||
* throttled again. The difference from wake ups in balance_pgdat() is
|
||||
* that here we are under prepare_to_wait().
|
||||
*/
|
||||
if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
|
||||
wake_up(&pgdat->pfmemalloc_wait);
|
||||
return false;
|
||||
}
|
||||
if (waitqueue_active(&pgdat->pfmemalloc_wait))
|
||||
wake_up_all(&pgdat->pfmemalloc_wait);
|
||||
|
||||
return pgdat_balanced(pgdat, order, classzone_idx);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user