2019-05-19 12:08:55 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2008-07-26 02:44:36 +00:00
|
|
|
#include <linux/mm.h>
|
2006-01-08 09:01:43 +00:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/string.h>
|
2014-04-07 22:37:26 +00:00
|
|
|
#include <linux/compiler.h>
|
2011-10-16 06:01:52 +00:00
|
|
|
#include <linux/export.h>
|
2006-03-24 11:18:42 +00:00
|
|
|
#include <linux/err.h>
|
2008-07-26 22:22:28 +00:00
|
|
|
#include <linux/sched.h>
|
2017-02-08 17:51:29 +00:00
|
|
|
#include <linux/sched/mm.h>
|
2019-07-16 23:30:54 +00:00
|
|
|
#include <linux/sched/signal.h>
|
2017-02-08 17:51:37 +00:00
|
|
|
#include <linux/sched/task_stack.h>
|
2012-05-31 00:17:35 +00:00
|
|
|
#include <linux/security.h>
|
2013-02-23 00:34:35 +00:00
|
|
|
#include <linux/swap.h>
|
2013-02-23 00:34:37 +00:00
|
|
|
#include <linux/swapops.h>
|
2013-11-12 23:08:31 +00:00
|
|
|
#include <linux/mman.h>
|
|
|
|
#include <linux/hugetlb.h>
|
2014-05-06 18:02:53 +00:00
|
|
|
#include <linux/vmalloc.h>
|
2017-02-24 22:58:22 +00:00
|
|
|
#include <linux/userfaultfd_k.h>
|
2019-09-23 22:38:37 +00:00
|
|
|
#include <linux/elf.h>
|
2019-09-23 22:38:47 +00:00
|
|
|
#include <linux/elf-randomize.h>
|
|
|
|
#include <linux/personality.h>
|
2019-09-23 22:38:37 +00:00
|
|
|
#include <linux/random.h>
|
2019-09-23 22:38:47 +00:00
|
|
|
#include <linux/processor.h>
|
|
|
|
#include <linux/sizes.h>
|
|
|
|
#include <linux/compat.h>
|
2013-11-12 23:08:31 +00:00
|
|
|
|
2016-12-24 19:46:01 +00:00
|
|
|
#include <linux/uaccess.h>
|
2006-01-08 09:01:43 +00:00
|
|
|
|
mm: nommu: sort mm->mmap list properly
When I was reading nommu code, I found that it handles the vma list/tree
in an unusual way. IIUC, because there can be more than one
identical/overrapped vmas in the list/tree, it sorts the tree more
strictly and does a linear search on the tree. But it doesn't applied to
the list (i.e. the list could be constructed in a different order than
the tree so that we can't use the list when finding the first vma in that
order).
Since inserting/sorting a vma in the tree and link is done at the same
time, we can easily construct both of them in the same order. And linear
searching on the tree could be more costly than doing it on the list, it
can be converted to use the list.
Also, after the commit 297c5eee3724 ("mm: make the vma list be doubly
linked") made the list be doubly linked, there were a couple of code need
to be fixed to construct the list properly.
Patch 1/6 is a preparation. It maintains the list sorted same as the tree
and construct doubly-linked list properly. Patch 2/6 is a simple
optimization for the vma deletion. Patch 3/6 and 4/6 convert tree
traversal to list traversal and the rest are simple fixes and cleanups.
This patch:
@vma added into @mm should be sorted by start addr, end addr and VMA
struct addr in that order because we may get identical VMAs in the @mm.
However this was true only for the rbtree, not for the list.
This patch fixes this by remembering 'rb_prev' during the tree traversal
like find_vma_prepare() does and linking the @vma via __vma_link_list().
After this patch, we can iterate the whole VMAs in correct order simply by
using @mm->mmap list.
[akpm@linux-foundation.org: avoid duplicating __vma_link_list()]
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-05-25 00:11:22 +00:00
|
|
|
#include "internal.h"
|
2022-05-10 01:20:47 +00:00
|
|
|
#include "swap.h"
|
mm: nommu: sort mm->mmap list properly
When I was reading nommu code, I found that it handles the vma list/tree
in an unusual way. IIUC, because there can be more than one
identical/overrapped vmas in the list/tree, it sorts the tree more
strictly and does a linear search on the tree. But it doesn't applied to
the list (i.e. the list could be constructed in a different order than
the tree so that we can't use the list when finding the first vma in that
order).
Since inserting/sorting a vma in the tree and link is done at the same
time, we can easily construct both of them in the same order. And linear
searching on the tree could be more costly than doing it on the list, it
can be converted to use the list.
Also, after the commit 297c5eee3724 ("mm: make the vma list be doubly
linked") made the list be doubly linked, there were a couple of code need
to be fixed to construct the list properly.
Patch 1/6 is a preparation. It maintains the list sorted same as the tree
and construct doubly-linked list properly. Patch 2/6 is a simple
optimization for the vma deletion. Patch 3/6 and 4/6 convert tree
traversal to list traversal and the rest are simple fixes and cleanups.
This patch:
@vma added into @mm should be sorted by start addr, end addr and VMA
struct addr in that order because we may get identical VMAs in the @mm.
However this was true only for the rbtree, not for the list.
This patch fixes this by remembering 'rb_prev' during the tree traversal
like find_vma_prepare() does and linking the @vma via __vma_link_list().
After this patch, we can iterate the whole VMAs in correct order simply by
using @mm->mmap list.
[akpm@linux-foundation.org: avoid duplicating __vma_link_list()]
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-05-25 00:11:22 +00:00
|
|
|
|
2015-02-13 22:36:24 +00:00
|
|
|
/**
|
|
|
|
* kfree_const - conditionally free memory
|
|
|
|
* @x: pointer to the memory
|
|
|
|
*
|
|
|
|
* Function calls kfree only if @x is not in .rodata section.
|
|
|
|
*/
|
|
|
|
void kfree_const(const void *x)
|
|
|
|
{
|
|
|
|
if (!is_kernel_rodata((unsigned long)x))
|
|
|
|
kfree(x);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kfree_const);
|
|
|
|
|
2006-01-08 09:01:43 +00:00
|
|
|
/**
|
|
|
|
* kstrdup - allocate space for and copy an existing string
|
|
|
|
* @s: the string to duplicate
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
2019-03-05 23:48:42 +00:00
|
|
|
*
|
|
|
|
* Return: newly allocated copy of @s or %NULL in case of error
|
2006-01-08 09:01:43 +00:00
|
|
|
*/
|
2023-02-28 15:34:38 +00:00
|
|
|
noinline
|
2006-01-08 09:01:43 +00:00
|
|
|
char *kstrdup(const char *s, gfp_t gfp)
|
|
|
|
{
|
|
|
|
size_t len;
|
|
|
|
char *buf;
|
|
|
|
|
|
|
|
if (!s)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
len = strlen(s) + 1;
|
2006-10-04 09:15:25 +00:00
|
|
|
buf = kmalloc_track_caller(len, gfp);
|
2006-01-08 09:01:43 +00:00
|
|
|
if (buf)
|
|
|
|
memcpy(buf, s, len);
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kstrdup);
|
2006-03-24 11:18:42 +00:00
|
|
|
|
2015-02-13 22:36:24 +00:00
|
|
|
/**
|
|
|
|
* kstrdup_const - conditionally duplicate an existing const string
|
|
|
|
* @s: the string to duplicate
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
|
|
*
|
2020-10-16 03:07:39 +00:00
|
|
|
* Note: Strings allocated by kstrdup_const should be freed by kfree_const and
|
|
|
|
* must not be passed to krealloc().
|
2019-03-05 23:48:42 +00:00
|
|
|
*
|
|
|
|
* Return: source string if it is in .rodata section otherwise
|
|
|
|
* fallback to kstrdup.
|
2015-02-13 22:36:24 +00:00
|
|
|
*/
|
|
|
|
const char *kstrdup_const(const char *s, gfp_t gfp)
|
|
|
|
{
|
|
|
|
if (is_kernel_rodata((unsigned long)s))
|
|
|
|
return s;
|
|
|
|
|
|
|
|
return kstrdup(s, gfp);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kstrdup_const);
|
|
|
|
|
2007-07-18 01:37:02 +00:00
|
|
|
/**
|
|
|
|
* kstrndup - allocate space for and copy an existing string
|
|
|
|
* @s: the string to duplicate
|
|
|
|
* @max: read at most @max chars from @s
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
2017-07-04 16:25:02 +00:00
|
|
|
*
|
|
|
|
* Note: Use kmemdup_nul() instead if the size is known exactly.
|
2019-03-05 23:48:42 +00:00
|
|
|
*
|
|
|
|
* Return: newly allocated copy of @s or %NULL in case of error
|
2007-07-18 01:37:02 +00:00
|
|
|
*/
|
|
|
|
char *kstrndup(const char *s, size_t max, gfp_t gfp)
|
|
|
|
{
|
|
|
|
size_t len;
|
|
|
|
char *buf;
|
|
|
|
|
|
|
|
if (!s)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
len = strnlen(s, max);
|
|
|
|
buf = kmalloc_track_caller(len+1, gfp);
|
|
|
|
if (buf) {
|
|
|
|
memcpy(buf, s, len);
|
|
|
|
buf[len] = '\0';
|
|
|
|
}
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kstrndup);
|
|
|
|
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-01 06:27:20 +00:00
|
|
|
/**
|
|
|
|
* kmemdup - duplicate region of memory
|
|
|
|
*
|
|
|
|
* @src: memory region to duplicate
|
|
|
|
* @len: memory region length
|
|
|
|
* @gfp: GFP mask to use
|
2019-03-05 23:48:42 +00:00
|
|
|
*
|
2022-12-21 14:42:45 +00:00
|
|
|
* Return: newly allocated copy of @src or %NULL in case of error,
|
|
|
|
* result is physically contiguous. Use kfree() to free.
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-01 06:27:20 +00:00
|
|
|
*/
|
2024-03-21 16:36:47 +00:00
|
|
|
void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-01 06:27:20 +00:00
|
|
|
{
|
|
|
|
void *p;
|
|
|
|
|
2024-03-21 16:36:47 +00:00
|
|
|
p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-01 06:27:20 +00:00
|
|
|
if (p)
|
|
|
|
memcpy(p, src, len);
|
|
|
|
return p;
|
|
|
|
}
|
2024-03-21 16:36:47 +00:00
|
|
|
EXPORT_SYMBOL(kmemdup_noprof);
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-01 06:27:20 +00:00
|
|
|
|
2023-10-17 05:23:15 +00:00
|
|
|
/**
|
|
|
|
* kmemdup_array - duplicate a given array.
|
|
|
|
*
|
|
|
|
* @src: array to duplicate.
|
|
|
|
* @element_size: size of each element of array.
|
|
|
|
* @count: number of elements to duplicate from array.
|
|
|
|
* @gfp: GFP mask to use.
|
|
|
|
*
|
|
|
|
* Return: duplicated array of @src or %NULL in case of error,
|
|
|
|
* result is physically contiguous. Use kfree() to free.
|
|
|
|
*/
|
|
|
|
void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp)
|
|
|
|
{
|
|
|
|
return kmemdup(src, size_mul(element_size, count), gfp);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kmemdup_array);
|
|
|
|
|
2022-12-21 14:42:45 +00:00
|
|
|
/**
|
|
|
|
* kvmemdup - duplicate region of memory
|
|
|
|
*
|
|
|
|
* @src: memory region to duplicate
|
|
|
|
* @len: memory region length
|
|
|
|
* @gfp: GFP mask to use
|
|
|
|
*
|
|
|
|
* Return: newly allocated copy of @src or %NULL in case of error,
|
|
|
|
* result may be not physically contiguous. Use kvfree() to free.
|
|
|
|
*/
|
|
|
|
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
|
|
|
|
{
|
|
|
|
void *p;
|
|
|
|
|
|
|
|
p = kvmalloc(len, gfp);
|
|
|
|
if (p)
|
|
|
|
memcpy(p, src, len);
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kvmemdup);
|
|
|
|
|
2017-07-04 16:25:02 +00:00
|
|
|
/**
|
|
|
|
* kmemdup_nul - Create a NUL-terminated string from unterminated data
|
|
|
|
* @s: The data to stringify
|
|
|
|
* @len: The size of the data
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
2019-03-05 23:48:42 +00:00
|
|
|
*
|
|
|
|
* Return: newly allocated copy of @s with NUL-termination or %NULL in
|
|
|
|
* case of error
|
2017-07-04 16:25:02 +00:00
|
|
|
*/
|
|
|
|
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
|
|
|
|
{
|
|
|
|
char *buf;
|
|
|
|
|
|
|
|
if (!s)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
buf = kmalloc_track_caller(len + 1, gfp);
|
|
|
|
if (buf) {
|
|
|
|
memcpy(buf, s, len);
|
|
|
|
buf[len] = '\0';
|
|
|
|
}
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kmemdup_nul);
|
|
|
|
|
2009-03-31 22:23:16 +00:00
|
|
|
/**
|
|
|
|
* memdup_user - duplicate memory region from user space
|
|
|
|
*
|
|
|
|
* @src: source address in user space
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
*
|
2019-03-05 23:48:42 +00:00
|
|
|
* Return: an ERR_PTR() on failure. Result is physically
|
2018-01-07 18:06:15 +00:00
|
|
|
* contiguous, to be freed by kfree().
|
2009-03-31 22:23:16 +00:00
|
|
|
*/
|
|
|
|
void *memdup_user(const void __user *src, size_t len)
|
|
|
|
{
|
|
|
|
void *p;
|
|
|
|
|
2019-02-21 06:20:42 +00:00
|
|
|
p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
|
2009-03-31 22:23:16 +00:00
|
|
|
if (!p)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
|
|
kfree(p);
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
}
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(memdup_user);
|
|
|
|
|
2018-01-07 18:06:15 +00:00
|
|
|
/**
|
|
|
|
* vmemdup_user - duplicate memory region from user space
|
|
|
|
*
|
|
|
|
* @src: source address in user space
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
*
|
2019-03-05 23:48:42 +00:00
|
|
|
* Return: an ERR_PTR() on failure. Result may be not
|
2018-01-07 18:06:15 +00:00
|
|
|
* physically contiguous. Use kvfree() to free.
|
|
|
|
*/
|
|
|
|
void *vmemdup_user(const void __user *src, size_t len)
|
|
|
|
{
|
|
|
|
void *p;
|
|
|
|
|
|
|
|
p = kvmalloc(len, GFP_USER);
|
|
|
|
if (!p)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
|
|
kvfree(p);
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
}
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(vmemdup_user);
|
|
|
|
|
2018-08-24 00:00:59 +00:00
|
|
|
/**
|
2006-03-24 11:18:42 +00:00
|
|
|
* strndup_user - duplicate an existing string from user space
|
|
|
|
* @s: The string to duplicate
|
|
|
|
* @n: Maximum number of bytes to copy, including the trailing NUL.
|
2019-03-05 23:48:42 +00:00
|
|
|
*
|
2019-04-06 01:39:34 +00:00
|
|
|
* Return: newly allocated copy of @s or an ERR_PTR() in case of error
|
2006-03-24 11:18:42 +00:00
|
|
|
*/
|
|
|
|
char *strndup_user(const char __user *s, long n)
|
|
|
|
{
|
|
|
|
char *p;
|
|
|
|
long length;
|
|
|
|
|
|
|
|
length = strnlen_user(s, n);
|
|
|
|
|
|
|
|
if (!length)
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
|
|
|
|
if (length > n)
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
2010-08-10 00:18:26 +00:00
|
|
|
p = memdup_user(s, length);
|
2006-03-24 11:18:42 +00:00
|
|
|
|
2010-08-10 00:18:26 +00:00
|
|
|
if (IS_ERR(p))
|
|
|
|
return p;
|
2006-03-24 11:18:42 +00:00
|
|
|
|
|
|
|
p[length - 1] = '\0';
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(strndup_user);
|
2008-07-26 02:44:36 +00:00
|
|
|
|
2015-12-24 05:06:05 +00:00
|
|
|
/**
|
|
|
|
* memdup_user_nul - duplicate memory region from user space and NUL-terminate
|
|
|
|
*
|
|
|
|
* @src: source address in user space
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
*
|
2019-03-05 23:48:42 +00:00
|
|
|
* Return: an ERR_PTR() on failure.
|
2015-12-24 05:06:05 +00:00
|
|
|
*/
|
|
|
|
void *memdup_user_nul(const void __user *src, size_t len)
|
|
|
|
{
|
|
|
|
char *p;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Always use GFP_KERNEL, since copy_from_user() can sleep and
|
|
|
|
* cause pagefault, which makes it pointless to use GFP_NOFS
|
|
|
|
* or GFP_ATOMIC.
|
|
|
|
*/
|
|
|
|
p = kmalloc_track_caller(len + 1, GFP_KERNEL);
|
|
|
|
if (!p)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
|
|
kfree(p);
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
}
|
|
|
|
p[len] = '\0';
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(memdup_user_nul);
|
|
|
|
|
procfs: mark thread stack correctly in proc/<pid>/maps
Stack for a new thread is mapped by userspace code and passed via
sys_clone. This memory is currently seen as anonymous in
/proc/<pid>/maps, which makes it difficult to ascertain which mappings
are being used for thread stacks. This patch uses the individual task
stack pointers to determine which vmas are actually thread stacks.
For a multithreaded program like the following:
#include <pthread.h>
void *thread_main(void *foo)
{
while(1);
}
int main()
{
pthread_t t;
pthread_create(&t, NULL, thread_main, NULL);
pthread_join(t, NULL);
}
proc/PID/maps looks like the following:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack since
the earlier vma that has no permissions (7f8a44e3d000-7f8a4503d000) but
that is not always a reliable way to find out which vma is a thread
stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps has the same
content.
With this patch in place, /proc/PID/task/TID/maps are treated as 'maps
as the task would see it' and hence, only the vma that that task uses as
stack is marked as [stack]. All other 'stack' vmas are marked as
anonymous memory. /proc/PID/maps acts as a thread group level view,
where all thread stack vmas are marked as [stack:TID] where TID is the
process ID of the task that uses that vma as stack, while the process
stack is marked as [stack].
So /proc/PID/maps will look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Thus marking all vmas that are used as stacks by the threads in the
thread group along with the process stack. The task level maps will
however like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
where only the vma that is being used as a stack by *that* task is
marked as [stack].
Analogous changes have been made to /proc/PID/smaps,
/proc/PID/numa_maps, /proc/PID/task/TID/smaps and
/proc/PID/task/TID/numa_maps. Relevant snippets from smaps and
numa_maps:
[siddhesh@localhost ~ ]$ pgrep a.out
1441
[siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack"
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack"
7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2
7fff6273a000 default stack anon=3 dirty=3 N0=3
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack"
7f8a44492000 default stack anon=2 dirty=2 N0=2
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack"
7fff6273a000 default stack anon=3 dirty=3 N0=3
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix build]
Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:04 +00:00
|
|
|
/* Check if the vma is being used as a stack by this task */
|
2016-09-30 17:58:58 +00:00
|
|
|
int vma_is_stack_for_current(struct vm_area_struct *vma)
|
procfs: mark thread stack correctly in proc/<pid>/maps
Stack for a new thread is mapped by userspace code and passed via
sys_clone. This memory is currently seen as anonymous in
/proc/<pid>/maps, which makes it difficult to ascertain which mappings
are being used for thread stacks. This patch uses the individual task
stack pointers to determine which vmas are actually thread stacks.
For a multithreaded program like the following:
#include <pthread.h>
void *thread_main(void *foo)
{
while(1);
}
int main()
{
pthread_t t;
pthread_create(&t, NULL, thread_main, NULL);
pthread_join(t, NULL);
}
proc/PID/maps looks like the following:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack since
the earlier vma that has no permissions (7f8a44e3d000-7f8a4503d000) but
that is not always a reliable way to find out which vma is a thread
stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps has the same
content.
With this patch in place, /proc/PID/task/TID/maps are treated as 'maps
as the task would see it' and hence, only the vma that that task uses as
stack is marked as [stack]. All other 'stack' vmas are marked as
anonymous memory. /proc/PID/maps acts as a thread group level view,
where all thread stack vmas are marked as [stack:TID] where TID is the
process ID of the task that uses that vma as stack, while the process
stack is marked as [stack].
So /proc/PID/maps will look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Thus marking all vmas that are used as stacks by the threads in the
thread group along with the process stack. The task level maps will
however like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
where only the vma that is being used as a stack by *that* task is
marked as [stack].
Analogous changes have been made to /proc/PID/smaps,
/proc/PID/numa_maps, /proc/PID/task/TID/smaps and
/proc/PID/task/TID/numa_maps. Relevant snippets from smaps and
numa_maps:
[siddhesh@localhost ~ ]$ pgrep a.out
1441
[siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack"
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack"
7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2
7fff6273a000 default stack anon=3 dirty=3 N0=3
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack"
7f8a44492000 default stack anon=2 dirty=2 N0=2
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack"
7fff6273a000 default stack anon=3 dirty=3 N0=3
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix build]
Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:04 +00:00
|
|
|
{
|
2016-09-30 17:58:58 +00:00
|
|
|
struct task_struct * __maybe_unused t = current;
|
|
|
|
|
procfs: mark thread stack correctly in proc/<pid>/maps
Stack for a new thread is mapped by userspace code and passed via
sys_clone. This memory is currently seen as anonymous in
/proc/<pid>/maps, which makes it difficult to ascertain which mappings
are being used for thread stacks. This patch uses the individual task
stack pointers to determine which vmas are actually thread stacks.
For a multithreaded program like the following:
#include <pthread.h>
void *thread_main(void *foo)
{
while(1);
}
int main()
{
pthread_t t;
pthread_create(&t, NULL, thread_main, NULL);
pthread_join(t, NULL);
}
proc/PID/maps looks like the following:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack since
the earlier vma that has no permissions (7f8a44e3d000-7f8a4503d000) but
that is not always a reliable way to find out which vma is a thread
stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps has the same
content.
With this patch in place, /proc/PID/task/TID/maps are treated as 'maps
as the task would see it' and hence, only the vma that that task uses as
stack is marked as [stack]. All other 'stack' vmas are marked as
anonymous memory. /proc/PID/maps acts as a thread group level view,
where all thread stack vmas are marked as [stack:TID] where TID is the
process ID of the task that uses that vma as stack, while the process
stack is marked as [stack].
So /proc/PID/maps will look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Thus marking all vmas that are used as stacks by the threads in the
thread group along with the process stack. The task level maps will
however like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
where only the vma that is being used as a stack by *that* task is
marked as [stack].
Analogous changes have been made to /proc/PID/smaps,
/proc/PID/numa_maps, /proc/PID/task/TID/smaps and
/proc/PID/task/TID/numa_maps. Relevant snippets from smaps and
numa_maps:
[siddhesh@localhost ~ ]$ pgrep a.out
1441
[siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack"
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack"
7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2
7fff6273a000 default stack anon=3 dirty=3 N0=3
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack"
7f8a44492000 default stack anon=2 dirty=2 N0=2
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack"
7fff6273a000 default stack anon=3 dirty=3 N0=3
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix build]
Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:04 +00:00
|
|
|
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
|
|
|
|
}
|
|
|
|
|
2020-09-14 13:09:33 +00:00
|
|
|
/*
|
|
|
|
* Change backing file, only valid to use during initial VMA setup.
|
|
|
|
*/
|
|
|
|
void vma_set_file(struct vm_area_struct *vma, struct file *file)
|
|
|
|
{
|
|
|
|
/* Changing an anonymous vma with this is illegal */
|
|
|
|
get_file(file);
|
|
|
|
swap(vma->vm_file, file);
|
|
|
|
fput(file);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(vma_set_file);
|
|
|
|
|
2019-09-23 22:38:37 +00:00
|
|
|
#ifndef STACK_RND_MASK
|
|
|
|
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
|
|
|
|
#endif
|
|
|
|
|
|
|
|
unsigned long randomize_stack_top(unsigned long stack_top)
|
|
|
|
{
|
|
|
|
unsigned long random_variable = 0;
|
|
|
|
|
|
|
|
if (current->flags & PF_RANDOMIZE) {
|
|
|
|
random_variable = get_random_long();
|
|
|
|
random_variable &= STACK_RND_MASK;
|
|
|
|
random_variable <<= PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
return PAGE_ALIGN(stack_top) + random_variable;
|
|
|
|
#else
|
|
|
|
return PAGE_ALIGN(stack_top) - random_variable;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2022-05-14 11:59:30 +00:00
|
|
|
/**
|
|
|
|
* randomize_page - Generate a random, page aligned address
|
|
|
|
* @start: The smallest acceptable address the caller will take.
|
|
|
|
* @range: The size of the area, starting at @start, within which the
|
|
|
|
* random address must fall.
|
|
|
|
*
|
|
|
|
* If @start + @range would overflow, @range is capped.
|
|
|
|
*
|
|
|
|
* NOTE: Historical use of randomize_range, which this replaces, presumed that
|
|
|
|
* @start was already page aligned. We now align it regardless.
|
|
|
|
*
|
|
|
|
* Return: A page aligned address within [start, start + range). On error,
|
|
|
|
* @start is returned.
|
|
|
|
*/
|
|
|
|
unsigned long randomize_page(unsigned long start, unsigned long range)
|
|
|
|
{
|
|
|
|
if (!PAGE_ALIGNED(start)) {
|
|
|
|
range -= PAGE_ALIGN(start) - start;
|
|
|
|
start = PAGE_ALIGN(start);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (start > ULONG_MAX - range)
|
|
|
|
range = ULONG_MAX - start;
|
|
|
|
|
|
|
|
range >>= PAGE_SHIFT;
|
|
|
|
|
|
|
|
if (range == 0)
|
|
|
|
return start;
|
|
|
|
|
|
|
|
return start + (get_random_long() % range << PAGE_SHIFT);
|
|
|
|
}
|
|
|
|
|
2019-09-23 22:38:47 +00:00
|
|
|
#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
|
2022-04-09 17:17:26 +00:00
|
|
|
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
|
2019-09-23 22:38:50 +00:00
|
|
|
{
|
|
|
|
/* Is the current task 32bit ? */
|
|
|
|
if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
|
|
|
|
return randomize_page(mm->brk, SZ_32M);
|
|
|
|
|
|
|
|
return randomize_page(mm->brk, SZ_1G);
|
|
|
|
}
|
|
|
|
|
2019-09-23 22:38:47 +00:00
|
|
|
unsigned long arch_mmap_rnd(void)
|
|
|
|
{
|
|
|
|
unsigned long rnd;
|
|
|
|
|
|
|
|
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
|
|
|
|
if (is_compat_task())
|
|
|
|
rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
|
|
|
|
else
|
|
|
|
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
|
|
|
|
rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
|
|
|
|
|
|
|
|
return rnd << PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int mmap_is_legacy(struct rlimit *rlim_stack)
|
|
|
|
{
|
|
|
|
if (current->personality & ADDR_COMPAT_LAYOUT)
|
|
|
|
return 1;
|
|
|
|
|
2023-08-18 22:53:28 +00:00
|
|
|
/* On parisc the stack always grows up - so a unlimited stack should
|
|
|
|
* not be an indicator to use the legacy memory layout. */
|
|
|
|
if (rlim_stack->rlim_cur == RLIM_INFINITY &&
|
|
|
|
!IS_ENABLED(CONFIG_STACK_GROWSUP))
|
2019-09-23 22:38:47 +00:00
|
|
|
return 1;
|
|
|
|
|
|
|
|
return sysctl_legacy_va_layout;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Leave enough space between the mmap area and the stack to honour ulimit in
|
|
|
|
* the face of randomisation.
|
|
|
|
*/
|
|
|
|
#define MIN_GAP (SZ_128M)
|
|
|
|
#define MAX_GAP (STACK_TOP / 6 * 5)
|
|
|
|
|
|
|
|
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
|
|
|
|
{
|
2023-11-13 10:12:57 +00:00
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
/*
|
|
|
|
* For an upwards growing stack the calculation is much simpler.
|
|
|
|
* Memory for the maximum stack size is reserved at the top of the
|
|
|
|
* task. mmap_base starts directly below the stack and grows
|
|
|
|
* downwards.
|
|
|
|
*/
|
|
|
|
return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
|
|
|
|
#else
|
2019-09-23 22:38:47 +00:00
|
|
|
unsigned long gap = rlim_stack->rlim_cur;
|
|
|
|
unsigned long pad = stack_guard_gap;
|
|
|
|
|
|
|
|
/* Account for stack randomization if necessary */
|
|
|
|
if (current->flags & PF_RANDOMIZE)
|
|
|
|
pad += (STACK_RND_MASK << PAGE_SHIFT);
|
|
|
|
|
|
|
|
/* Values close to RLIM_INFINITY can overflow. */
|
|
|
|
if (gap + pad > gap)
|
|
|
|
gap += pad;
|
|
|
|
|
|
|
|
if (gap < MIN_GAP)
|
|
|
|
gap = MIN_GAP;
|
|
|
|
else if (gap > MAX_GAP)
|
|
|
|
gap = MAX_GAP;
|
|
|
|
|
|
|
|
return PAGE_ALIGN(STACK_TOP - gap - rnd);
|
2023-11-13 10:12:57 +00:00
|
|
|
#endif
|
2019-09-23 22:38:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
|
|
|
|
{
|
|
|
|
unsigned long random_factor = 0UL;
|
|
|
|
|
|
|
|
if (current->flags & PF_RANDOMIZE)
|
|
|
|
random_factor = arch_mmap_rnd();
|
|
|
|
|
|
|
|
if (mmap_is_legacy(rlim_stack)) {
|
|
|
|
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
|
mm: switch mm->get_unmapped_area() to a flag
The mm_struct contains a function pointer *get_unmapped_area(), which is
set to either arch_get_unmapped_area() or arch_get_unmapped_area_topdown()
during the initialization of the mm.
Since the function pointer only ever points to two functions that are
named the same across all arch's, a function pointer is not really
required. In addition future changes will want to add versions of the
functions that take additional arguments. So to save a pointers worth of
bytes in mm_struct, and prevent adding additional function pointers to
mm_struct in future changes, remove it and keep the information about
which get_unmapped_area() to use in a flag.
Add the new flag to MMF_INIT_MASK so it doesn't get clobbered on fork by
mmf_init_flags(). Most MM flags get clobbered on fork. In the
pre-existing behavior mm->get_unmapped_area() would get copied to the new
mm in dup_mm(), so not clobbering the flag preserves the existing behavior
around inheriting the topdown-ness.
Introduce a helper, mm_get_unmapped_area(), to easily convert code that
refers to the old function pointer to instead select and call either
arch_get_unmapped_area() or arch_get_unmapped_area_topdown() based on the
flag. Then drop the mm->get_unmapped_area() function pointer. Leave the
get_unmapped_area() pointer in struct file_operations alone. The main
purpose of this change is to reorganize in preparation for future changes,
but it also converts the calls of mm->get_unmapped_area() from indirect
branches into a direct ones.
The stress-ng bigheap benchmark calls realloc a lot, which calls through
get_unmapped_area() in the kernel. On x86, the change yielded a ~1%
improvement there on a retpoline config.
In testing a few x86 configs, removing the pointer unfortunately didn't
result in any actual size reductions in the compiled layout of mm_struct.
But depending on compiler or arch alignment requirements, the change could
shrink the size of mm_struct.
Link: https://lkml.kernel.org/r/20240326021656.202649-3-rick.p.edgecombe@intel.com
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: H. Peter Anvin (Intel) <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-03-26 02:16:44 +00:00
|
|
|
clear_bit(MMF_TOPDOWN, &mm->flags);
|
2019-09-23 22:38:47 +00:00
|
|
|
} else {
|
|
|
|
mm->mmap_base = mmap_base(random_factor, rlim_stack);
|
mm: switch mm->get_unmapped_area() to a flag
The mm_struct contains a function pointer *get_unmapped_area(), which is
set to either arch_get_unmapped_area() or arch_get_unmapped_area_topdown()
during the initialization of the mm.
Since the function pointer only ever points to two functions that are
named the same across all arch's, a function pointer is not really
required. In addition future changes will want to add versions of the
functions that take additional arguments. So to save a pointers worth of
bytes in mm_struct, and prevent adding additional function pointers to
mm_struct in future changes, remove it and keep the information about
which get_unmapped_area() to use in a flag.
Add the new flag to MMF_INIT_MASK so it doesn't get clobbered on fork by
mmf_init_flags(). Most MM flags get clobbered on fork. In the
pre-existing behavior mm->get_unmapped_area() would get copied to the new
mm in dup_mm(), so not clobbering the flag preserves the existing behavior
around inheriting the topdown-ness.
Introduce a helper, mm_get_unmapped_area(), to easily convert code that
refers to the old function pointer to instead select and call either
arch_get_unmapped_area() or arch_get_unmapped_area_topdown() based on the
flag. Then drop the mm->get_unmapped_area() function pointer. Leave the
get_unmapped_area() pointer in struct file_operations alone. The main
purpose of this change is to reorganize in preparation for future changes,
but it also converts the calls of mm->get_unmapped_area() from indirect
branches into a direct ones.
The stress-ng bigheap benchmark calls realloc a lot, which calls through
get_unmapped_area() in the kernel. On x86, the change yielded a ~1%
improvement there on a retpoline config.
In testing a few x86 configs, removing the pointer unfortunately didn't
result in any actual size reductions in the compiled layout of mm_struct.
But depending on compiler or arch alignment requirements, the change could
shrink the size of mm_struct.
Link: https://lkml.kernel.org/r/20240326021656.202649-3-rick.p.edgecombe@intel.com
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: H. Peter Anvin (Intel) <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-03-26 02:16:44 +00:00
|
|
|
set_bit(MMF_TOPDOWN, &mm->flags);
|
2019-09-23 22:38:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
|
2018-04-10 23:34:53 +00:00
|
|
|
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
|
2008-07-26 02:44:36 +00:00
|
|
|
{
|
|
|
|
mm->mmap_base = TASK_UNMAPPED_BASE;
|
mm: switch mm->get_unmapped_area() to a flag
The mm_struct contains a function pointer *get_unmapped_area(), which is
set to either arch_get_unmapped_area() or arch_get_unmapped_area_topdown()
during the initialization of the mm.
Since the function pointer only ever points to two functions that are
named the same across all arch's, a function pointer is not really
required. In addition future changes will want to add versions of the
functions that take additional arguments. So to save a pointers worth of
bytes in mm_struct, and prevent adding additional function pointers to
mm_struct in future changes, remove it and keep the information about
which get_unmapped_area() to use in a flag.
Add the new flag to MMF_INIT_MASK so it doesn't get clobbered on fork by
mmf_init_flags(). Most MM flags get clobbered on fork. In the
pre-existing behavior mm->get_unmapped_area() would get copied to the new
mm in dup_mm(), so not clobbering the flag preserves the existing behavior
around inheriting the topdown-ness.
Introduce a helper, mm_get_unmapped_area(), to easily convert code that
refers to the old function pointer to instead select and call either
arch_get_unmapped_area() or arch_get_unmapped_area_topdown() based on the
flag. Then drop the mm->get_unmapped_area() function pointer. Leave the
get_unmapped_area() pointer in struct file_operations alone. The main
purpose of this change is to reorganize in preparation for future changes,
but it also converts the calls of mm->get_unmapped_area() from indirect
branches into a direct ones.
The stress-ng bigheap benchmark calls realloc a lot, which calls through
get_unmapped_area() in the kernel. On x86, the change yielded a ~1%
improvement there on a retpoline config.
In testing a few x86 configs, removing the pointer unfortunately didn't
result in any actual size reductions in the compiled layout of mm_struct.
But depending on compiler or arch alignment requirements, the change could
shrink the size of mm_struct.
Link: https://lkml.kernel.org/r/20240326021656.202649-3-rick.p.edgecombe@intel.com
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: H. Peter Anvin (Intel) <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-03-26 02:16:44 +00:00
|
|
|
clear_bit(MMF_TOPDOWN, &mm->flags);
|
2008-07-26 02:44:36 +00:00
|
|
|
}
|
|
|
|
#endif
|
2008-08-12 22:52:52 +00:00
|
|
|
|
2019-07-16 23:30:54 +00:00
|
|
|
/**
|
|
|
|
* __account_locked_vm - account locked pages to an mm's locked_vm
|
|
|
|
* @mm: mm to account against
|
|
|
|
* @pages: number of pages to account
|
|
|
|
* @inc: %true if @pages should be considered positive, %false if not
|
|
|
|
* @task: task used to check RLIMIT_MEMLOCK
|
|
|
|
* @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
|
|
|
|
*
|
|
|
|
* Assumes @task and @mm are valid (i.e. at least one reference on each), and
|
2020-06-09 04:33:54 +00:00
|
|
|
* that mmap_lock is held as writer.
|
2019-07-16 23:30:54 +00:00
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* * 0 on success
|
|
|
|
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
|
|
|
|
*/
|
|
|
|
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
|
|
|
|
struct task_struct *task, bool bypass_rlim)
|
|
|
|
{
|
|
|
|
unsigned long locked_vm, limit;
|
|
|
|
int ret = 0;
|
|
|
|
|
2020-06-09 04:33:44 +00:00
|
|
|
mmap_assert_write_locked(mm);
|
2019-07-16 23:30:54 +00:00
|
|
|
|
|
|
|
locked_vm = mm->locked_vm;
|
|
|
|
if (inc) {
|
|
|
|
if (!bypass_rlim) {
|
|
|
|
limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
|
|
|
|
if (locked_vm + pages > limit)
|
|
|
|
ret = -ENOMEM;
|
|
|
|
}
|
|
|
|
if (!ret)
|
|
|
|
mm->locked_vm = locked_vm + pages;
|
|
|
|
} else {
|
|
|
|
WARN_ON_ONCE(pages > locked_vm);
|
|
|
|
mm->locked_vm = locked_vm - pages;
|
|
|
|
}
|
|
|
|
|
|
|
|
pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
|
|
|
|
(void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
|
|
|
|
locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
|
|
|
|
ret ? " - exceeded" : "");
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__account_locked_vm);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* account_locked_vm - account locked pages to an mm's locked_vm
|
|
|
|
* @mm: mm to account against, may be NULL
|
|
|
|
* @pages: number of pages to account
|
|
|
|
* @inc: %true if @pages should be considered positive, %false if not
|
|
|
|
*
|
|
|
|
* Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
|
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* * 0 on success, or if mm is NULL
|
|
|
|
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
|
|
|
|
*/
|
|
|
|
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (pages == 0 || !mm)
|
|
|
|
return 0;
|
|
|
|
|
2020-06-09 04:33:25 +00:00
|
|
|
mmap_write_lock(mm);
|
2019-07-16 23:30:54 +00:00
|
|
|
ret = __account_locked_vm(mm, pages, inc, current,
|
|
|
|
capable(CAP_IPC_LOCK));
|
2020-06-09 04:33:25 +00:00
|
|
|
mmap_write_unlock(mm);
|
2019-07-16 23:30:54 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(account_locked_vm);
|
|
|
|
|
2012-05-31 00:17:35 +00:00
|
|
|
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
2016-05-23 23:25:30 +00:00
|
|
|
unsigned long flag, unsigned long pgoff)
|
2012-05-31 00:17:35 +00:00
|
|
|
{
|
|
|
|
unsigned long ret;
|
|
|
|
struct mm_struct *mm = current->mm;
|
2013-02-23 00:32:47 +00:00
|
|
|
unsigned long populate;
|
2017-02-24 22:58:22 +00:00
|
|
|
LIST_HEAD(uf);
|
2012-05-31 00:17:35 +00:00
|
|
|
|
|
|
|
ret = security_mmap_file(file, prot, flag);
|
|
|
|
if (!ret) {
|
2020-06-09 04:33:25 +00:00
|
|
|
if (mmap_write_lock_killable(mm))
|
2016-05-23 23:25:30 +00:00
|
|
|
return -EINTR;
|
2023-06-13 00:10:30 +00:00
|
|
|
ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
|
2020-08-07 06:23:37 +00:00
|
|
|
&uf);
|
2020-06-09 04:33:25 +00:00
|
|
|
mmap_write_unlock(mm);
|
2017-02-24 22:58:22 +00:00
|
|
|
userfaultfd_unmap_complete(mm, &uf);
|
2013-02-23 00:32:47 +00:00
|
|
|
if (populate)
|
|
|
|
mm_populate(ret, populate);
|
2012-05-31 00:17:35 +00:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long vm_mmap(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long offset)
|
|
|
|
{
|
|
|
|
if (unlikely(offset + PAGE_ALIGN(len) < offset))
|
|
|
|
return -EINVAL;
|
2015-11-06 02:46:46 +00:00
|
|
|
if (unlikely(offset_in_page(offset)))
|
2012-05-31 00:17:35 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2016-05-23 23:25:30 +00:00
|
|
|
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
|
2012-05-31 00:17:35 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(vm_mmap);
|
|
|
|
|
2017-05-08 22:57:09 +00:00
|
|
|
/**
|
|
|
|
* kvmalloc_node - attempt to allocate physically contiguous memory, but upon
|
|
|
|
* failure, fall back to non-contiguous (vmalloc) allocation.
|
|
|
|
* @size: size of the request.
|
|
|
|
* @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
|
|
|
|
* @node: numa node to allocate from
|
|
|
|
*
|
|
|
|
* Uses kmalloc to get the memory but if the allocation fails then falls back
|
|
|
|
* to the vmalloc allocator. Use kvfree for freeing the memory.
|
|
|
|
*
|
2022-01-14 22:07:07 +00:00
|
|
|
* GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
|
2017-07-12 21:36:52 +00:00
|
|
|
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
|
|
|
|
* preferable to the vmalloc fallback, due to visible performance drawbacks.
|
2017-05-08 22:57:09 +00:00
|
|
|
*
|
2019-03-05 23:48:42 +00:00
|
|
|
* Return: pointer to the allocated memory of %NULL in case of failure
|
2017-05-08 22:57:09 +00:00
|
|
|
*/
|
2024-03-21 16:36:47 +00:00
|
|
|
void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node)
|
2017-05-08 22:57:09 +00:00
|
|
|
{
|
|
|
|
gfp_t kmalloc_flags = flags;
|
|
|
|
void *ret;
|
|
|
|
|
|
|
|
/*
|
2017-06-02 21:46:19 +00:00
|
|
|
* We want to attempt a large physically contiguous block first because
|
|
|
|
* it is less likely to fragment multiple larger blocks and therefore
|
|
|
|
* contribute to a long term fragmentation less than vmalloc fallback.
|
|
|
|
* However make sure that larger requests are not too disruptive - no
|
|
|
|
* OOM killer and no allocation failure warnings as we have a fallback.
|
2017-05-08 22:57:09 +00:00
|
|
|
*/
|
2017-05-08 22:57:15 +00:00
|
|
|
if (size > PAGE_SIZE) {
|
|
|
|
kmalloc_flags |= __GFP_NOWARN;
|
|
|
|
|
2017-07-12 21:36:52 +00:00
|
|
|
if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
|
2017-05-08 22:57:15 +00:00
|
|
|
kmalloc_flags |= __GFP_NORETRY;
|
2022-01-14 22:07:07 +00:00
|
|
|
|
|
|
|
/* nofail semantic is implemented by the vmalloc fallback */
|
|
|
|
kmalloc_flags &= ~__GFP_NOFAIL;
|
2017-05-08 22:57:15 +00:00
|
|
|
}
|
2017-05-08 22:57:09 +00:00
|
|
|
|
2024-03-21 16:36:47 +00:00
|
|
|
ret = kmalloc_node_noprof(size, kmalloc_flags, node);
|
2017-05-08 22:57:09 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* It doesn't really make sense to fallback to vmalloc for sub page
|
|
|
|
* requests
|
|
|
|
*/
|
|
|
|
if (ret || size <= PAGE_SIZE)
|
|
|
|
return ret;
|
|
|
|
|
2022-09-26 15:16:50 +00:00
|
|
|
/* non-sleeping allocations are not supported by vmalloc */
|
|
|
|
if (!gfpflags_allow_blocking(flags))
|
|
|
|
return NULL;
|
|
|
|
|
2021-07-14 16:45:49 +00:00
|
|
|
/* Don't even allow crazy sizes */
|
mm: Consider __GFP_NOWARN flag for oversized kvmalloc() calls
syzkaller was recently triggering an oversized kvmalloc() warning via
xdp_umem_create().
The triggered warning was added back in 7661809d493b ("mm: don't allow
oversized kvmalloc() calls"). The rationale for the warning for huge
kvmalloc sizes was as a reaction to a security bug where the size was
more than UINT_MAX but not everything was prepared to handle unsigned
long sizes.
Anyway, the AF_XDP related call trace from this syzkaller report was:
kvmalloc include/linux/mm.h:806 [inline]
kvmalloc_array include/linux/mm.h:824 [inline]
kvcalloc include/linux/mm.h:829 [inline]
xdp_umem_pin_pages net/xdp/xdp_umem.c:102 [inline]
xdp_umem_reg net/xdp/xdp_umem.c:219 [inline]
xdp_umem_create+0x6a5/0xf00 net/xdp/xdp_umem.c:252
xsk_setsockopt+0x604/0x790 net/xdp/xsk.c:1068
__sys_setsockopt+0x1fd/0x4e0 net/socket.c:2176
__do_sys_setsockopt net/socket.c:2187 [inline]
__se_sys_setsockopt net/socket.c:2184 [inline]
__x64_sys_setsockopt+0xb5/0x150 net/socket.c:2184
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Björn mentioned that requests for >2GB allocation can still be valid:
The structure that is being allocated is the page-pinning accounting.
AF_XDP has an internal limit of U32_MAX pages, which is *a lot*, but
still fewer than what memcg allows (PAGE_COUNTER_MAX is a LONG_MAX/
PAGE_SIZE on 64 bit systems). [...]
I could just change from U32_MAX to INT_MAX, but as I stated earlier
that has a hacky feeling to it. [...] From my perspective, the code
isn't broken, with the memcg limits in consideration. [...]
Linus says:
[...] Pretty much every time this has come up, the kernel warning has
shown that yes, the code was broken and there really wasn't a reason
for doing allocations that big.
Of course, some people would be perfectly fine with the allocation
failing, they just don't want the warning. I didn't want __GFP_NOWARN
to shut it up originally because I wanted people to see all those
cases, but these days I think we can just say "yeah, people can shut
it up explicitly by saying 'go ahead and fail this allocation, don't
warn about it'".
So enough time has passed that by now I'd certainly be ok with [it].
Thus allow call-sites to silence such userspace triggered splats if the
allocation requests have __GFP_NOWARN. For xdp_umem_pin_pages()'s call
to kvcalloc() this is already the case, so nothing else needed there.
Fixes: 7661809d493b ("mm: don't allow oversized kvmalloc() calls")
Reported-by: syzbot+11421fbbff99b989670e@syzkaller.appspotmail.com
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: syzbot+11421fbbff99b989670e@syzkaller.appspotmail.com
Cc: Björn Töpel <bjorn@kernel.org>
Cc: Magnus Karlsson <magnus.karlsson@intel.com>
Cc: Willy Tarreau <w@1wt.eu>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Link: https://lore.kernel.org/bpf/CAJ+HfNhyfsT5cS_U9EC213ducHs9k9zNxX9+abqC0kTrPbQ0gg@mail.gmail.com
Link: https://lore.kernel.org/bpf/20211201202905.b9892171e3f5b9a60f9da251@linux-foundation.org
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Ackd-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-03-04 14:26:32 +00:00
|
|
|
if (unlikely(size > INT_MAX)) {
|
|
|
|
WARN_ON_ONCE(!(flags & __GFP_NOWARN));
|
2021-07-14 16:45:49 +00:00
|
|
|
return NULL;
|
mm: Consider __GFP_NOWARN flag for oversized kvmalloc() calls
syzkaller was recently triggering an oversized kvmalloc() warning via
xdp_umem_create().
The triggered warning was added back in 7661809d493b ("mm: don't allow
oversized kvmalloc() calls"). The rationale for the warning for huge
kvmalloc sizes was as a reaction to a security bug where the size was
more than UINT_MAX but not everything was prepared to handle unsigned
long sizes.
Anyway, the AF_XDP related call trace from this syzkaller report was:
kvmalloc include/linux/mm.h:806 [inline]
kvmalloc_array include/linux/mm.h:824 [inline]
kvcalloc include/linux/mm.h:829 [inline]
xdp_umem_pin_pages net/xdp/xdp_umem.c:102 [inline]
xdp_umem_reg net/xdp/xdp_umem.c:219 [inline]
xdp_umem_create+0x6a5/0xf00 net/xdp/xdp_umem.c:252
xsk_setsockopt+0x604/0x790 net/xdp/xsk.c:1068
__sys_setsockopt+0x1fd/0x4e0 net/socket.c:2176
__do_sys_setsockopt net/socket.c:2187 [inline]
__se_sys_setsockopt net/socket.c:2184 [inline]
__x64_sys_setsockopt+0xb5/0x150 net/socket.c:2184
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Björn mentioned that requests for >2GB allocation can still be valid:
The structure that is being allocated is the page-pinning accounting.
AF_XDP has an internal limit of U32_MAX pages, which is *a lot*, but
still fewer than what memcg allows (PAGE_COUNTER_MAX is a LONG_MAX/
PAGE_SIZE on 64 bit systems). [...]
I could just change from U32_MAX to INT_MAX, but as I stated earlier
that has a hacky feeling to it. [...] From my perspective, the code
isn't broken, with the memcg limits in consideration. [...]
Linus says:
[...] Pretty much every time this has come up, the kernel warning has
shown that yes, the code was broken and there really wasn't a reason
for doing allocations that big.
Of course, some people would be perfectly fine with the allocation
failing, they just don't want the warning. I didn't want __GFP_NOWARN
to shut it up originally because I wanted people to see all those
cases, but these days I think we can just say "yeah, people can shut
it up explicitly by saying 'go ahead and fail this allocation, don't
warn about it'".
So enough time has passed that by now I'd certainly be ok with [it].
Thus allow call-sites to silence such userspace triggered splats if the
allocation requests have __GFP_NOWARN. For xdp_umem_pin_pages()'s call
to kvcalloc() this is already the case, so nothing else needed there.
Fixes: 7661809d493b ("mm: don't allow oversized kvmalloc() calls")
Reported-by: syzbot+11421fbbff99b989670e@syzkaller.appspotmail.com
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: syzbot+11421fbbff99b989670e@syzkaller.appspotmail.com
Cc: Björn Töpel <bjorn@kernel.org>
Cc: Magnus Karlsson <magnus.karlsson@intel.com>
Cc: Willy Tarreau <w@1wt.eu>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Link: https://lore.kernel.org/bpf/CAJ+HfNhyfsT5cS_U9EC213ducHs9k9zNxX9+abqC0kTrPbQ0gg@mail.gmail.com
Link: https://lore.kernel.org/bpf/20211201202905.b9892171e3f5b9a60f9da251@linux-foundation.org
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Ackd-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-03-04 14:26:32 +00:00
|
|
|
}
|
2021-07-14 16:45:49 +00:00
|
|
|
|
kvmalloc: use vmalloc_huge for vmalloc allocations
Since commit 559089e0a93d ("vmalloc: replace VM_NO_HUGE_VMAP with
VM_ALLOW_HUGE_VMAP"), the use of hugepage mappings for vmalloc is an
opt-in strategy, because it caused a number of problems that weren't
noticed until x86 enabled it too.
One of the issues was fixed by Nick Piggin in commit 3b8000ae185c
("mm/vmalloc: huge vmalloc backing pages should be split rather than
compound"), but I'm still worried about page protection issues, and
VM_FLUSH_RESET_PERMS in particular.
However, like the hash table allocation case (commit f2edd118d02d:
"page_alloc: use vmalloc_huge for large system hash"), the use of
kvmalloc() should be safe from any such games, since the returned
pointer might be a SLUB allocation, and as such no user should
reasonably be using it in any odd ways.
We also know that the allocations are fairly large, since it falls back
to the vmalloc case only when a kmalloc() fails. So using a hugepage
mapping seems both safe and relevant.
This patch does show a weakness in the opt-in strategy: since the opt-in
flag is in the 'vm_flags', not the usual gfp_t allocation flags, very
few of the usual interfaces actually expose it.
That's not much of an issue in this case that already used one of the
fairly specialized low-level vmalloc interfaces for the allocation, but
for a lot of other vmalloc() users that might want to opt in, it's going
to be very inconvenient.
We'll either have to fix any compatibility problems, or expose it in the
gfp flags (__GFP_COMP would have made a lot of sense) to allow normal
vmalloc() users to use hugepage mappings. That said, the cases that
really matter were probably already taken care of by the hash tabel
allocation.
Link: https://lore.kernel.org/all/20220415164413.2727220-1-song@kernel.org/
Link: https://lore.kernel.org/all/CAHk-=whao=iosX1s5Z4SF-ZGa-ebAukJoAdUJFk5SPwnofV+Vg@mail.gmail.com/
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Menzel <pmenzel@molgen.mpg.de>
Cc: Song Liu <songliubraving@fb.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-04-22 18:41:38 +00:00
|
|
|
/*
|
|
|
|
* kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
|
|
|
|
* since the callers already cannot assume anything
|
|
|
|
* about the resulting pointer, and cannot play
|
|
|
|
* protection games.
|
|
|
|
*/
|
2024-03-21 16:36:52 +00:00
|
|
|
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
|
kvmalloc: use vmalloc_huge for vmalloc allocations
Since commit 559089e0a93d ("vmalloc: replace VM_NO_HUGE_VMAP with
VM_ALLOW_HUGE_VMAP"), the use of hugepage mappings for vmalloc is an
opt-in strategy, because it caused a number of problems that weren't
noticed until x86 enabled it too.
One of the issues was fixed by Nick Piggin in commit 3b8000ae185c
("mm/vmalloc: huge vmalloc backing pages should be split rather than
compound"), but I'm still worried about page protection issues, and
VM_FLUSH_RESET_PERMS in particular.
However, like the hash table allocation case (commit f2edd118d02d:
"page_alloc: use vmalloc_huge for large system hash"), the use of
kvmalloc() should be safe from any such games, since the returned
pointer might be a SLUB allocation, and as such no user should
reasonably be using it in any odd ways.
We also know that the allocations are fairly large, since it falls back
to the vmalloc case only when a kmalloc() fails. So using a hugepage
mapping seems both safe and relevant.
This patch does show a weakness in the opt-in strategy: since the opt-in
flag is in the 'vm_flags', not the usual gfp_t allocation flags, very
few of the usual interfaces actually expose it.
That's not much of an issue in this case that already used one of the
fairly specialized low-level vmalloc interfaces for the allocation, but
for a lot of other vmalloc() users that might want to opt in, it's going
to be very inconvenient.
We'll either have to fix any compatibility problems, or expose it in the
gfp flags (__GFP_COMP would have made a lot of sense) to allow normal
vmalloc() users to use hugepage mappings. That said, the cases that
really matter were probably already taken care of by the hash tabel
allocation.
Link: https://lore.kernel.org/all/20220415164413.2727220-1-song@kernel.org/
Link: https://lore.kernel.org/all/CAHk-=whao=iosX1s5Z4SF-ZGa-ebAukJoAdUJFk5SPwnofV+Vg@mail.gmail.com/
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Menzel <pmenzel@molgen.mpg.de>
Cc: Song Liu <songliubraving@fb.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-04-22 18:41:38 +00:00
|
|
|
flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
|
|
|
|
node, __builtin_return_address(0));
|
2017-05-08 22:57:09 +00:00
|
|
|
}
|
2024-03-21 16:36:47 +00:00
|
|
|
EXPORT_SYMBOL(kvmalloc_node_noprof);
|
2017-05-08 22:57:09 +00:00
|
|
|
|
2018-08-24 00:01:02 +00:00
|
|
|
/**
|
2018-09-04 22:45:55 +00:00
|
|
|
* kvfree() - Free memory.
|
|
|
|
* @addr: Pointer to allocated memory.
|
2018-08-24 00:01:02 +00:00
|
|
|
*
|
2018-09-04 22:45:55 +00:00
|
|
|
* kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
|
|
|
|
* It is slightly more efficient to use kfree() or vfree() if you are certain
|
|
|
|
* that you know which one to use.
|
|
|
|
*
|
2018-10-26 22:07:00 +00:00
|
|
|
* Context: Either preemptible task context or not-NMI interrupt.
|
2018-08-24 00:01:02 +00:00
|
|
|
*/
|
2014-05-06 18:02:53 +00:00
|
|
|
void kvfree(const void *addr)
|
|
|
|
{
|
|
|
|
if (is_vmalloc_addr(addr))
|
|
|
|
vfree(addr);
|
|
|
|
else
|
|
|
|
kfree(addr);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kvfree);
|
|
|
|
|
2020-06-04 23:48:21 +00:00
|
|
|
/**
|
|
|
|
* kvfree_sensitive - Free a data object containing sensitive information.
|
|
|
|
* @addr: address of the data object to be freed.
|
|
|
|
* @len: length of the data object.
|
|
|
|
*
|
|
|
|
* Use the special memzero_explicit() function to clear the content of a
|
|
|
|
* kvmalloc'ed object containing sensitive data to make sure that the
|
|
|
|
* compiler won't optimize out the data clearing.
|
|
|
|
*/
|
|
|
|
void kvfree_sensitive(const void *addr, size_t len)
|
|
|
|
{
|
|
|
|
if (likely(!ZERO_OR_NULL_PTR(addr))) {
|
|
|
|
memzero_explicit((void *)addr, len);
|
|
|
|
kvfree(addr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kvfree_sensitive);
|
|
|
|
|
2024-03-21 16:36:47 +00:00
|
|
|
void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
|
2021-08-09 17:10:00 +00:00
|
|
|
{
|
|
|
|
void *newp;
|
|
|
|
|
|
|
|
if (oldsize >= newsize)
|
|
|
|
return (void *)p;
|
|
|
|
newp = kvmalloc(newsize, flags);
|
|
|
|
if (!newp)
|
|
|
|
return NULL;
|
|
|
|
memcpy(newp, p, oldsize);
|
|
|
|
kvfree(p);
|
|
|
|
return newp;
|
|
|
|
}
|
2024-03-21 16:36:47 +00:00
|
|
|
EXPORT_SYMBOL(kvrealloc_noprof);
|
2021-08-09 17:10:00 +00:00
|
|
|
|
2022-03-08 09:47:22 +00:00
|
|
|
/**
|
|
|
|
* __vmalloc_array - allocate memory for a virtually contiguous array.
|
|
|
|
* @n: number of elements.
|
|
|
|
* @size: element size.
|
|
|
|
* @flags: the type of memory to allocate (see kmalloc).
|
|
|
|
*/
|
2024-03-21 16:36:52 +00:00
|
|
|
void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
|
2022-03-08 09:47:22 +00:00
|
|
|
{
|
|
|
|
size_t bytes;
|
|
|
|
|
|
|
|
if (unlikely(check_mul_overflow(n, size, &bytes)))
|
|
|
|
return NULL;
|
|
|
|
return __vmalloc(bytes, flags);
|
|
|
|
}
|
2024-03-21 16:36:52 +00:00
|
|
|
EXPORT_SYMBOL(__vmalloc_array_noprof);
|
2022-03-08 09:47:22 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* vmalloc_array - allocate memory for a virtually contiguous array.
|
|
|
|
* @n: number of elements.
|
|
|
|
* @size: element size.
|
|
|
|
*/
|
2024-03-21 16:36:52 +00:00
|
|
|
void *vmalloc_array_noprof(size_t n, size_t size)
|
2022-03-08 09:47:22 +00:00
|
|
|
{
|
|
|
|
return __vmalloc_array(n, size, GFP_KERNEL);
|
|
|
|
}
|
2024-03-21 16:36:52 +00:00
|
|
|
EXPORT_SYMBOL(vmalloc_array_noprof);
|
2022-03-08 09:47:22 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* __vcalloc - allocate and zero memory for a virtually contiguous array.
|
|
|
|
* @n: number of elements.
|
|
|
|
* @size: element size.
|
|
|
|
* @flags: the type of memory to allocate (see kmalloc).
|
|
|
|
*/
|
2024-03-21 16:36:52 +00:00
|
|
|
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
|
2022-03-08 09:47:22 +00:00
|
|
|
{
|
|
|
|
return __vmalloc_array(n, size, flags | __GFP_ZERO);
|
|
|
|
}
|
2024-03-21 16:36:52 +00:00
|
|
|
EXPORT_SYMBOL(__vcalloc_noprof);
|
2022-03-08 09:47:22 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* vcalloc - allocate and zero memory for a virtually contiguous array.
|
|
|
|
* @n: number of elements.
|
|
|
|
* @size: element size.
|
|
|
|
*/
|
2024-03-21 16:36:52 +00:00
|
|
|
void *vcalloc_noprof(size_t n, size_t size)
|
2022-03-08 09:47:22 +00:00
|
|
|
{
|
|
|
|
return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
|
|
|
|
}
|
2024-03-21 16:36:52 +00:00
|
|
|
EXPORT_SYMBOL(vcalloc_noprof);
|
2022-03-08 09:47:22 +00:00
|
|
|
|
2022-01-29 16:52:52 +00:00
|
|
|
struct anon_vma *folio_anon_vma(struct folio *folio)
|
2015-04-15 23:14:53 +00:00
|
|
|
{
|
2021-05-07 15:17:34 +00:00
|
|
|
unsigned long mapping = (unsigned long)folio->mapping;
|
2015-04-15 23:14:53 +00:00
|
|
|
|
|
|
|
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
|
|
|
|
return NULL;
|
2021-05-07 15:17:34 +00:00
|
|
|
return (void *)(mapping - PAGE_MAPPING_ANON);
|
2015-04-15 23:14:53 +00:00
|
|
|
}
|
|
|
|
|
mm/util: Add folio_mapping() and folio_file_mapping()
These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping(). Adjust page_file_mapping() and page_mapping_file()
to use folios internally. Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.
This ends up saving 122 bytes of text overall. folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes. The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()). Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:
48 8b 56 08 mov 0x8(%rsi),%rdx
48 8d 42 ff lea -0x1(%rdx),%rax
83 e2 01 and $0x1,%edx
48 0f 44 c6 cmove %rsi,%rax
to become:
48 8b 46 08 mov 0x8(%rsi),%rax
48 8d 78 ff lea -0x1(%rax),%rdi
a8 01 test $0x1,%al
48 0f 44 fe cmove %rsi,%rdi
for a reduction of a single byte. Once the NFS client is converted to
use folios, this entire sequence will disappear.
Also add folio_mapping() documentation.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
2020-12-10 15:55:05 +00:00
|
|
|
/**
|
|
|
|
* folio_mapping - Find the mapping where this folio is stored.
|
|
|
|
* @folio: The folio.
|
|
|
|
*
|
|
|
|
* For folios which are in the page cache, return the mapping that this
|
|
|
|
* page belongs to. Folios in the swap cache return the swap mapping
|
|
|
|
* this page is stored in (which is different from the mapping for the
|
|
|
|
* swap file or swap device where the data is stored).
|
|
|
|
*
|
|
|
|
* You can call this for folios which aren't in the swap cache or page
|
|
|
|
* cache and it will return NULL.
|
|
|
|
*/
|
|
|
|
struct address_space *folio_mapping(struct folio *folio)
|
2013-02-23 00:34:35 +00:00
|
|
|
{
|
2016-01-16 00:52:07 +00:00
|
|
|
struct address_space *mapping;
|
|
|
|
|
2014-01-15 01:56:40 +00:00
|
|
|
/* This happens if someone calls flush_dcache_page on slab page */
|
mm/util: Add folio_mapping() and folio_file_mapping()
These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping(). Adjust page_file_mapping() and page_mapping_file()
to use folios internally. Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.
This ends up saving 122 bytes of text overall. folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes. The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()). Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:
48 8b 56 08 mov 0x8(%rsi),%rdx
48 8d 42 ff lea -0x1(%rdx),%rax
83 e2 01 and $0x1,%edx
48 0f 44 c6 cmove %rsi,%rax
to become:
48 8b 46 08 mov 0x8(%rsi),%rax
48 8d 78 ff lea -0x1(%rax),%rdi
a8 01 test $0x1,%al
48 0f 44 fe cmove %rsi,%rdi
for a reduction of a single byte. Once the NFS client is converted to
use folios, this entire sequence will disappear.
Also add folio_mapping() documentation.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
2020-12-10 15:55:05 +00:00
|
|
|
if (unlikely(folio_test_slab(folio)))
|
2014-01-15 01:56:40 +00:00
|
|
|
return NULL;
|
|
|
|
|
mm/util: Add folio_mapping() and folio_file_mapping()
These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping(). Adjust page_file_mapping() and page_mapping_file()
to use folios internally. Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.
This ends up saving 122 bytes of text overall. folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes. The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()). Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:
48 8b 56 08 mov 0x8(%rsi),%rdx
48 8d 42 ff lea -0x1(%rdx),%rax
83 e2 01 and $0x1,%edx
48 0f 44 c6 cmove %rsi,%rax
to become:
48 8b 46 08 mov 0x8(%rsi),%rax
48 8d 78 ff lea -0x1(%rax),%rdi
a8 01 test $0x1,%al
48 0f 44 fe cmove %rsi,%rdi
for a reduction of a single byte. Once the NFS client is converted to
use folios, this entire sequence will disappear.
Also add folio_mapping() documentation.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
2020-12-10 15:55:05 +00:00
|
|
|
if (unlikely(folio_test_swapcache(folio)))
|
2023-08-21 16:08:48 +00:00
|
|
|
return swap_address_space(folio->swap);
|
2015-04-15 23:14:53 +00:00
|
|
|
|
mm/util: Add folio_mapping() and folio_file_mapping()
These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping(). Adjust page_file_mapping() and page_mapping_file()
to use folios internally. Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.
This ends up saving 122 bytes of text overall. folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes. The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()). Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:
48 8b 56 08 mov 0x8(%rsi),%rdx
48 8d 42 ff lea -0x1(%rdx),%rax
83 e2 01 and $0x1,%edx
48 0f 44 c6 cmove %rsi,%rax
to become:
48 8b 46 08 mov 0x8(%rsi),%rax
48 8d 78 ff lea -0x1(%rax),%rdi
a8 01 test $0x1,%al
48 0f 44 fe cmove %rsi,%rdi
for a reduction of a single byte. Once the NFS client is converted to
use folios, this entire sequence will disappear.
Also add folio_mapping() documentation.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
2020-12-10 15:55:05 +00:00
|
|
|
mapping = folio->mapping;
|
2022-06-07 19:38:48 +00:00
|
|
|
if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
|
2015-04-15 23:14:53 +00:00
|
|
|
return NULL;
|
mm: migrate: support non-lru movable page migration
We have allowed migration for only LRU pages until now and it was enough
to make high-order pages. But recently, embedded system(e.g., webOS,
android) uses lots of non-movable pages(e.g., zram, GPU memory) so we
have seen several reports about troubles of small high-order allocation.
For fixing the problem, there were several efforts (e,g,. enhance
compaction algorithm, SLUB fallback to 0-order page, reserved memory,
vmalloc and so on) but if there are lots of non-movable pages in system,
their solutions are void in the long run.
So, this patch is to support facility to change non-movable pages with
movable. For the feature, this patch introduces functions related to
migration to address_space_operations as well as some page flags.
If a driver want to make own pages movable, it should define three
functions which are function pointers of struct
address_space_operations.
1. bool (*isolate_page) (struct page *page, isolate_mode_t mode);
What VM expects on isolate_page function of driver is to return *true*
if driver isolates page successfully. On returing true, VM marks the
page as PG_isolated so concurrent isolation in several CPUs skip the
page for isolation. If a driver cannot isolate the page, it should
return *false*.
Once page is successfully isolated, VM uses page.lru fields so driver
shouldn't expect to preserve values in that fields.
2. int (*migratepage) (struct address_space *mapping,
struct page *newpage, struct page *oldpage, enum migrate_mode);
After isolation, VM calls migratepage of driver with isolated page. The
function of migratepage is to move content of the old page to new page
and set up fields of struct page newpage. Keep in mind that you should
indicate to the VM the oldpage is no longer movable via
__ClearPageMovable() under page_lock if you migrated the oldpage
successfully and returns 0. If driver cannot migrate the page at the
moment, driver can return -EAGAIN. On -EAGAIN, VM will retry page
migration in a short time because VM interprets -EAGAIN as "temporal
migration failure". On returning any error except -EAGAIN, VM will give
up the page migration without retrying in this time.
Driver shouldn't touch page.lru field VM using in the functions.
3. void (*putback_page)(struct page *);
If migration fails on isolated page, VM should return the isolated page
to the driver so VM calls driver's putback_page with migration failed
page. In this function, driver should put the isolated page back to the
own data structure.
4. non-lru movable page flags
There are two page flags for supporting non-lru movable page.
* PG_movable
Driver should use the below function to make page movable under
page_lock.
void __SetPageMovable(struct page *page, struct address_space *mapping)
It needs argument of address_space for registering migration family
functions which will be called by VM. Exactly speaking, PG_movable is
not a real flag of struct page. Rather than, VM reuses page->mapping's
lower bits to represent it.
#define PAGE_MAPPING_MOVABLE 0x2
page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
so driver shouldn't access page->mapping directly. Instead, driver
should use page_mapping which mask off the low two bits of page->mapping
so it can get right struct address_space.
For testing of non-lru movable page, VM supports __PageMovable function.
However, it doesn't guarantee to identify non-lru movable page because
page->mapping field is unified with other variables in struct page. As
well, if driver releases the page after isolation by VM, page->mapping
doesn't have stable value although it has PAGE_MAPPING_MOVABLE (Look at
__ClearPageMovable). But __PageMovable is cheap to catch whether page
is LRU or non-lru movable once the page has been isolated. Because LRU
pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
good for just peeking to test non-lru movable pages before more
expensive checking with lock_page in pfn scanning to select victim.
For guaranteeing non-lru movable page, VM provides PageMovable function.
Unlike __PageMovable, PageMovable functions validates page->mapping and
mapping->a_ops->isolate_page under lock_page. The lock_page prevents
sudden destroying of page->mapping.
Driver using __SetPageMovable should clear the flag via
__ClearMovablePage under page_lock before the releasing the page.
* PG_isolated
To prevent concurrent isolation among several CPUs, VM marks isolated
page as PG_isolated under lock_page. So if a CPU encounters PG_isolated
non-lru movable page, it can skip it. Driver doesn't need to manipulate
the flag because VM will set/clear it automatically. Keep in mind that
if driver sees PG_isolated page, it means the page have been isolated by
VM so it shouldn't touch page.lru field. PG_isolated is alias with
PG_reclaim flag so driver shouldn't use the flag for own purpose.
[opensource.ganesh@gmail.com: mm/compaction: remove local variable is_lru]
Link: http://lkml.kernel.org/r/20160618014841.GA7422@leo-test
Link: http://lkml.kernel.org/r/1464736881-24886-3-git-send-email-minchan@kernel.org
Signed-off-by: Gioh Kim <gi-oh.kim@profitbricks.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: John Einar Reitan <john.reitan@foss.arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-07-26 22:23:05 +00:00
|
|
|
|
2022-06-07 19:38:48 +00:00
|
|
|
return mapping;
|
2013-02-23 00:34:35 +00:00
|
|
|
}
|
mm/util: Add folio_mapping() and folio_file_mapping()
These are the folio equivalent of page_mapping() and page_file_mapping().
Add an out-of-line page_mapping() wrapper around folio_mapping()
in order to prevent the page_folio() call from bloating every caller
of page_mapping(). Adjust page_file_mapping() and page_mapping_file()
to use folios internally. Rename __page_file_mapping() to
swapcache_mapping() and change it to take a folio.
This ends up saving 122 bytes of text overall. folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes. The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()). Most of these appear
to be a slight change in gcc's register allocation decisions, which allow:
48 8b 56 08 mov 0x8(%rsi),%rdx
48 8d 42 ff lea -0x1(%rdx),%rax
83 e2 01 and $0x1,%edx
48 0f 44 c6 cmove %rsi,%rax
to become:
48 8b 46 08 mov 0x8(%rsi),%rax
48 8d 78 ff lea -0x1(%rax),%rdi
a8 01 test $0x1,%al
48 0f 44 fe cmove %rsi,%rdi
for a reduction of a single byte. Once the NFS client is converted to
use folios, this entire sequence will disappear.
Also add folio_mapping() documentation.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: David Howells <dhowells@redhat.com>
2020-12-10 15:55:05 +00:00
|
|
|
EXPORT_SYMBOL(folio_mapping);
|
2013-02-23 00:34:35 +00:00
|
|
|
|
2021-05-07 19:05:06 +00:00
|
|
|
/**
|
|
|
|
* folio_copy - Copy the contents of one folio to another.
|
|
|
|
* @dst: Folio to copy to.
|
|
|
|
* @src: Folio to copy from.
|
|
|
|
*
|
|
|
|
* The bytes in the folio represented by @src are copied to @dst.
|
|
|
|
* Assumes the caller has validated that @dst is at least as large as @src.
|
|
|
|
* Can be called in atomic context for order-0 folios, but if the folio is
|
|
|
|
* larger, it may sleep.
|
|
|
|
*/
|
|
|
|
void folio_copy(struct folio *dst, struct folio *src)
|
2021-07-12 15:32:07 +00:00
|
|
|
{
|
2021-05-07 19:05:06 +00:00
|
|
|
long i = 0;
|
|
|
|
long nr = folio_nr_pages(src);
|
2021-07-12 15:32:07 +00:00
|
|
|
|
2021-05-07 19:05:06 +00:00
|
|
|
for (;;) {
|
|
|
|
copy_highpage(folio_page(dst, i), folio_page(src, i));
|
|
|
|
if (++i == nr)
|
|
|
|
break;
|
2021-07-12 15:32:07 +00:00
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
}
|
2023-10-16 20:10:59 +00:00
|
|
|
EXPORT_SYMBOL(folio_copy);
|
2021-07-12 15:32:07 +00:00
|
|
|
|
2016-03-17 21:18:50 +00:00
|
|
|
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
|
|
|
|
int sysctl_overcommit_ratio __read_mostly = 50;
|
|
|
|
unsigned long sysctl_overcommit_kbytes __read_mostly;
|
|
|
|
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
|
|
|
|
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
|
|
|
|
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
|
|
|
|
|
2020-04-24 06:43:38 +00:00
|
|
|
int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
|
|
|
|
size_t *lenp, loff_t *ppos)
|
2014-01-21 23:49:14 +00:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = proc_dointvec(table, write, buffer, lenp, ppos);
|
|
|
|
if (ret == 0 && write)
|
|
|
|
sysctl_overcommit_kbytes = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-08-07 06:23:15 +00:00
|
|
|
static void sync_overcommit_as(struct work_struct *dummy)
|
|
|
|
{
|
|
|
|
percpu_counter_sync(&vm_committed_as);
|
|
|
|
}
|
|
|
|
|
|
|
|
int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
|
|
|
|
size_t *lenp, loff_t *ppos)
|
|
|
|
{
|
|
|
|
struct ctl_table t;
|
mm: fix uninitialized use in overcommit_policy_handler
We get an unexpected value of /proc/sys/vm/overcommit_memory after
running the following program:
int main()
{
int fd = open("/proc/sys/vm/overcommit_memory", O_RDWR);
write(fd, "1", 1);
write(fd, "2", 1);
close(fd);
}
write(fd, "2", 1) will pass *ppos = 1 to proc_dointvec_minmax.
proc_dointvec_minmax will return 0 without setting new_policy.
t.data = &new_policy;
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos)
-->do_proc_dointvec
-->__do_proc_dointvec
if (write) {
if (proc_first_pos_non_zero_ignore(ppos, table))
goto out;
sysctl_overcommit_memory = new_policy;
so sysctl_overcommit_memory will be set to an uninitialized value.
Check whether new_policy has been changed by proc_dointvec_minmax.
Link: https://lkml.kernel.org/r/20210923020524.13289-1-chenjun102@huawei.com
Fixes: 56f3547bfa4d ("mm: adjust vm_committed_as_batch according to vm overcommit policy")
Signed-off-by: Chen Jun <chenjun102@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Feng Tang <feng.tang@intel.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Rui Xiang <rui.xiang@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-24 22:44:06 +00:00
|
|
|
int new_policy = -1;
|
2020-08-07 06:23:15 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The deviation of sync_overcommit_as could be big with loose policy
|
|
|
|
* like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
|
|
|
|
* strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
|
2021-05-05 01:38:35 +00:00
|
|
|
* with the strict "NEVER", and to avoid possible race condition (even
|
2020-08-07 06:23:15 +00:00
|
|
|
* though user usually won't too frequently do the switching to policy
|
|
|
|
* OVERCOMMIT_NEVER), the switch is done in the following order:
|
|
|
|
* 1. changing the batch
|
|
|
|
* 2. sync percpu count on each CPU
|
|
|
|
* 3. switch the policy
|
|
|
|
*/
|
|
|
|
if (write) {
|
|
|
|
t = *table;
|
|
|
|
t.data = &new_policy;
|
|
|
|
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
|
mm: fix uninitialized use in overcommit_policy_handler
We get an unexpected value of /proc/sys/vm/overcommit_memory after
running the following program:
int main()
{
int fd = open("/proc/sys/vm/overcommit_memory", O_RDWR);
write(fd, "1", 1);
write(fd, "2", 1);
close(fd);
}
write(fd, "2", 1) will pass *ppos = 1 to proc_dointvec_minmax.
proc_dointvec_minmax will return 0 without setting new_policy.
t.data = &new_policy;
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos)
-->do_proc_dointvec
-->__do_proc_dointvec
if (write) {
if (proc_first_pos_non_zero_ignore(ppos, table))
goto out;
sysctl_overcommit_memory = new_policy;
so sysctl_overcommit_memory will be set to an uninitialized value.
Check whether new_policy has been changed by proc_dointvec_minmax.
Link: https://lkml.kernel.org/r/20210923020524.13289-1-chenjun102@huawei.com
Fixes: 56f3547bfa4d ("mm: adjust vm_committed_as_batch according to vm overcommit policy")
Signed-off-by: Chen Jun <chenjun102@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Feng Tang <feng.tang@intel.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Rui Xiang <rui.xiang@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-24 22:44:06 +00:00
|
|
|
if (ret || new_policy == -1)
|
2020-08-07 06:23:15 +00:00
|
|
|
return ret;
|
|
|
|
|
|
|
|
mm_compute_batch(new_policy);
|
|
|
|
if (new_policy == OVERCOMMIT_NEVER)
|
|
|
|
schedule_on_each_cpu(sync_overcommit_as);
|
|
|
|
sysctl_overcommit_memory = new_policy;
|
|
|
|
} else {
|
|
|
|
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-04-24 06:43:38 +00:00
|
|
|
int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
|
|
|
|
size_t *lenp, loff_t *ppos)
|
2014-01-21 23:49:14 +00:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
|
|
|
|
if (ret == 0 && write)
|
|
|
|
sysctl_overcommit_ratio = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-11-12 23:08:31 +00:00
|
|
|
/*
|
|
|
|
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
|
|
|
|
*/
|
|
|
|
unsigned long vm_commit_limit(void)
|
|
|
|
{
|
2014-01-21 23:49:14 +00:00
|
|
|
unsigned long allowed;
|
|
|
|
|
|
|
|
if (sysctl_overcommit_kbytes)
|
|
|
|
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
else
|
2018-12-28 08:34:29 +00:00
|
|
|
allowed = ((totalram_pages() - hugetlb_total_pages())
|
2014-01-21 23:49:14 +00:00
|
|
|
* sysctl_overcommit_ratio / 100);
|
|
|
|
allowed += total_swap_pages;
|
|
|
|
|
|
|
|
return allowed;
|
2013-11-12 23:08:31 +00:00
|
|
|
}
|
|
|
|
|
2016-03-17 21:18:50 +00:00
|
|
|
/*
|
|
|
|
* Make sure vm_committed_as in one cacheline and not cacheline shared with
|
|
|
|
* other variables. It can be updated by several CPUs frequently.
|
|
|
|
*/
|
|
|
|
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The global memory commitment made in the system can be a metric
|
|
|
|
* that can be used to drive ballooning decisions when Linux is hosted
|
|
|
|
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
|
|
|
|
* balancing memory across competing virtual machines that are hosted.
|
|
|
|
* Several metrics drive this policy engine including the guest reported
|
|
|
|
* memory commitment.
|
2020-08-07 06:23:07 +00:00
|
|
|
*
|
|
|
|
* The time cost of this is very low for small platforms, and for big
|
|
|
|
* platform like a 2S/36C/72T Skylake server, in worst case where
|
|
|
|
* vm_committed_as's spinlock is under severe contention, the time cost
|
|
|
|
* could be about 30~40 microseconds.
|
2016-03-17 21:18:50 +00:00
|
|
|
*/
|
|
|
|
unsigned long vm_memory_committed(void)
|
|
|
|
{
|
2020-08-07 06:23:07 +00:00
|
|
|
return percpu_counter_sum_positive(&vm_committed_as);
|
2016-03-17 21:18:50 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(vm_memory_committed);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check that a process has enough memory to allocate a new virtual
|
|
|
|
* mapping. 0 means there is enough memory for the allocation to
|
|
|
|
* succeed and -ENOMEM implies there is not.
|
|
|
|
*
|
|
|
|
* We currently support three overcommit policies, which are set via the
|
2022-06-27 06:00:26 +00:00
|
|
|
* vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst
|
2016-03-17 21:18:50 +00:00
|
|
|
*
|
|
|
|
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
|
|
|
|
* Additional code 2002 Jul 20 by Robert Love.
|
|
|
|
*
|
|
|
|
* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
|
|
|
|
*
|
|
|
|
* Note this is a helper function intended to be used by LSMs which
|
|
|
|
* wish to use this logic.
|
|
|
|
*/
|
|
|
|
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
|
|
|
|
{
|
2019-05-14 00:21:50 +00:00
|
|
|
long allowed;
|
2024-02-22 19:46:17 +00:00
|
|
|
unsigned long bytes_failed;
|
2016-03-17 21:18:50 +00:00
|
|
|
|
|
|
|
vm_acct_memory(pages);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sometimes we want to use more memory than we have
|
|
|
|
*/
|
|
|
|
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
|
2019-05-14 00:21:50 +00:00
|
|
|
if (pages > totalram_pages() + total_swap_pages)
|
2016-03-17 21:18:50 +00:00
|
|
|
goto error;
|
2019-05-14 00:21:50 +00:00
|
|
|
return 0;
|
2016-03-17 21:18:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
allowed = vm_commit_limit();
|
|
|
|
/*
|
|
|
|
* Reserve some for root
|
|
|
|
*/
|
|
|
|
if (!cap_sys_admin)
|
|
|
|
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't let a single process grow so big a user can't recover
|
|
|
|
*/
|
|
|
|
if (mm) {
|
2019-05-14 00:21:50 +00:00
|
|
|
long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
|
2016-03-17 21:18:50 +00:00
|
|
|
allowed -= min_t(long, mm->total_vm / 32, reserve);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
|
|
|
|
return 0;
|
|
|
|
error:
|
2024-02-22 19:46:17 +00:00
|
|
|
bytes_failed = pages << PAGE_SHIFT;
|
|
|
|
pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
|
|
|
|
__func__, current->pid, current->comm, bytes_failed);
|
2016-03-17 21:18:50 +00:00
|
|
|
vm_unacct_memory(pages);
|
|
|
|
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2014-02-11 18:11:59 +00:00
|
|
|
/**
|
|
|
|
* get_cmdline() - copy the cmdline value to a buffer.
|
|
|
|
* @task: the task whose cmdline value to copy.
|
|
|
|
* @buffer: the buffer to copy to.
|
|
|
|
* @buflen: the length of the buffer. Larger cmdline values are truncated
|
|
|
|
* to this length.
|
2019-03-05 23:48:42 +00:00
|
|
|
*
|
|
|
|
* Return: the size of the cmdline field copied. Note that the copy does
|
2014-02-11 18:11:59 +00:00
|
|
|
* not guarantee an ending NULL byte.
|
|
|
|
*/
|
|
|
|
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
|
|
|
|
{
|
|
|
|
int res = 0;
|
|
|
|
unsigned int len;
|
|
|
|
struct mm_struct *mm = get_task_mm(task);
|
2016-01-20 23:01:05 +00:00
|
|
|
unsigned long arg_start, arg_end, env_start, env_end;
|
2014-02-11 18:11:59 +00:00
|
|
|
if (!mm)
|
|
|
|
goto out;
|
|
|
|
if (!mm->arg_end)
|
|
|
|
goto out_mm; /* Shh! No looking before we're done */
|
|
|
|
|
2019-06-01 05:30:19 +00:00
|
|
|
spin_lock(&mm->arg_lock);
|
2016-01-20 23:01:05 +00:00
|
|
|
arg_start = mm->arg_start;
|
|
|
|
arg_end = mm->arg_end;
|
|
|
|
env_start = mm->env_start;
|
|
|
|
env_end = mm->env_end;
|
2019-06-01 05:30:19 +00:00
|
|
|
spin_unlock(&mm->arg_lock);
|
2016-01-20 23:01:05 +00:00
|
|
|
|
|
|
|
len = arg_end - arg_start;
|
2014-02-11 18:11:59 +00:00
|
|
|
|
|
|
|
if (len > buflen)
|
|
|
|
len = buflen;
|
|
|
|
|
2016-10-13 00:20:20 +00:00
|
|
|
res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
|
2014-02-11 18:11:59 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the nul at the end of args has been overwritten, then
|
|
|
|
* assume application is using setproctitle(3).
|
|
|
|
*/
|
|
|
|
if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
|
|
|
|
len = strnlen(buffer, res);
|
|
|
|
if (len < res) {
|
|
|
|
res = len;
|
|
|
|
} else {
|
2016-01-20 23:01:05 +00:00
|
|
|
len = env_end - env_start;
|
2014-02-11 18:11:59 +00:00
|
|
|
if (len > buflen - res)
|
|
|
|
len = buflen - res;
|
2016-01-20 23:01:05 +00:00
|
|
|
res += access_process_vm(task, env_start,
|
2016-10-13 00:20:20 +00:00
|
|
|
buffer+res, len,
|
|
|
|
FOLL_FORCE);
|
2014-02-11 18:11:59 +00:00
|
|
|
res = strnlen(buffer, res);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out_mm:
|
|
|
|
mmput(mm);
|
|
|
|
out:
|
|
|
|
return res;
|
|
|
|
}
|
2019-09-23 22:38:19 +00:00
|
|
|
|
2019-11-27 09:53:44 +00:00
|
|
|
int __weak memcmp_pages(struct page *page1, struct page *page2)
|
2019-09-23 22:38:19 +00:00
|
|
|
{
|
|
|
|
char *addr1, *addr2;
|
|
|
|
int ret;
|
|
|
|
|
2023-11-20 14:15:27 +00:00
|
|
|
addr1 = kmap_local_page(page1);
|
|
|
|
addr2 = kmap_local_page(page2);
|
2019-09-23 22:38:19 +00:00
|
|
|
ret = memcmp(addr1, addr2, PAGE_SIZE);
|
2023-11-20 14:15:27 +00:00
|
|
|
kunmap_local(addr2);
|
|
|
|
kunmap_local(addr1);
|
2019-09-23 22:38:19 +00:00
|
|
|
return ret;
|
|
|
|
}
|
mm: Add mem_dump_obj() to print source of memory block
There are kernel facilities such as per-CPU reference counts that give
error messages in generic handlers or callbacks, whose messages are
unenlightening. In the case of per-CPU reference-count underflow, this
is not a problem when creating a new use of this facility because in that
case the bug is almost certainly in the code implementing that new use.
However, trouble arises when deploying across many systems, which might
exercise corner cases that were not seen during development and testing.
Here, it would be really nice to get some kind of hint as to which of
several uses the underflow was caused by.
This commit therefore exposes a mem_dump_obj() function that takes
a pointer to memory (which must still be allocated if it has been
dynamically allocated) and prints available information on where that
memory came from. This pointer can reference the middle of the block as
well as the beginning of the block, as needed by things like RCU callback
functions and timer handlers that might not know where the beginning of
the memory block is. These functions and handlers can use mem_dump_obj()
to print out better hints as to where the problem might lie.
The information printed can depend on kernel configuration. For example,
the allocation return address can be printed only for slab and slub,
and even then only when the necessary debug has been enabled. For slab,
build with CONFIG_DEBUG_SLAB=y, and either use sizes with ample space
to the next power of two or use the SLAB_STORE_USER when creating the
kmem_cache structure. For slub, build with CONFIG_SLUB_DEBUG=y and
boot with slub_debug=U, or pass SLAB_STORE_USER to kmem_cache_create()
if more focused use is desired. Also for slub, use CONFIG_STACKTRACE
to enable printing of the allocation-time stack trace.
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>
Reported-by: Andrii Nakryiko <andrii@kernel.org>
[ paulmck: Convert to printing and change names per Joonsoo Kim. ]
[ paulmck: Move slab definition per Stephen Rothwell and kbuild test robot. ]
[ paulmck: Handle CONFIG_MMU=n case where vmalloc() is kmalloc(). ]
[ paulmck: Apply Vlastimil Babka feedback on slab.c kmem_provenance(). ]
[ paulmck: Extract more info from !SLUB_DEBUG per Joonsoo Kim. ]
[ paulmck: Explicitly check for small pointers per Naresh Kamboju. ]
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-12-08 01:41:02 +00:00
|
|
|
|
2021-01-07 21:46:11 +00:00
|
|
|
#ifdef CONFIG_PRINTK
|
mm: Add mem_dump_obj() to print source of memory block
There are kernel facilities such as per-CPU reference counts that give
error messages in generic handlers or callbacks, whose messages are
unenlightening. In the case of per-CPU reference-count underflow, this
is not a problem when creating a new use of this facility because in that
case the bug is almost certainly in the code implementing that new use.
However, trouble arises when deploying across many systems, which might
exercise corner cases that were not seen during development and testing.
Here, it would be really nice to get some kind of hint as to which of
several uses the underflow was caused by.
This commit therefore exposes a mem_dump_obj() function that takes
a pointer to memory (which must still be allocated if it has been
dynamically allocated) and prints available information on where that
memory came from. This pointer can reference the middle of the block as
well as the beginning of the block, as needed by things like RCU callback
functions and timer handlers that might not know where the beginning of
the memory block is. These functions and handlers can use mem_dump_obj()
to print out better hints as to where the problem might lie.
The information printed can depend on kernel configuration. For example,
the allocation return address can be printed only for slab and slub,
and even then only when the necessary debug has been enabled. For slab,
build with CONFIG_DEBUG_SLAB=y, and either use sizes with ample space
to the next power of two or use the SLAB_STORE_USER when creating the
kmem_cache structure. For slub, build with CONFIG_SLUB_DEBUG=y and
boot with slub_debug=U, or pass SLAB_STORE_USER to kmem_cache_create()
if more focused use is desired. Also for slub, use CONFIG_STACKTRACE
to enable printing of the allocation-time stack trace.
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>
Reported-by: Andrii Nakryiko <andrii@kernel.org>
[ paulmck: Convert to printing and change names per Joonsoo Kim. ]
[ paulmck: Move slab definition per Stephen Rothwell and kbuild test robot. ]
[ paulmck: Handle CONFIG_MMU=n case where vmalloc() is kmalloc(). ]
[ paulmck: Apply Vlastimil Babka feedback on slab.c kmem_provenance(). ]
[ paulmck: Extract more info from !SLUB_DEBUG per Joonsoo Kim. ]
[ paulmck: Explicitly check for small pointers per Naresh Kamboju. ]
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-12-08 01:41:02 +00:00
|
|
|
/**
|
|
|
|
* mem_dump_obj - Print available provenance information
|
|
|
|
* @object: object for which to find provenance information.
|
|
|
|
*
|
|
|
|
* This function uses pr_cont(), so that the caller is expected to have
|
|
|
|
* printed out whatever preamble is appropriate. The provenance information
|
|
|
|
* depends on the type of object and on how much debugging is enabled.
|
|
|
|
* For example, for a slab-cache object, the slab name is printed, and,
|
|
|
|
* if available, the return address and stack trace from the allocation
|
2021-03-16 10:37:11 +00:00
|
|
|
* and last free path of that object.
|
mm: Add mem_dump_obj() to print source of memory block
There are kernel facilities such as per-CPU reference counts that give
error messages in generic handlers or callbacks, whose messages are
unenlightening. In the case of per-CPU reference-count underflow, this
is not a problem when creating a new use of this facility because in that
case the bug is almost certainly in the code implementing that new use.
However, trouble arises when deploying across many systems, which might
exercise corner cases that were not seen during development and testing.
Here, it would be really nice to get some kind of hint as to which of
several uses the underflow was caused by.
This commit therefore exposes a mem_dump_obj() function that takes
a pointer to memory (which must still be allocated if it has been
dynamically allocated) and prints available information on where that
memory came from. This pointer can reference the middle of the block as
well as the beginning of the block, as needed by things like RCU callback
functions and timer handlers that might not know where the beginning of
the memory block is. These functions and handlers can use mem_dump_obj()
to print out better hints as to where the problem might lie.
The information printed can depend on kernel configuration. For example,
the allocation return address can be printed only for slab and slub,
and even then only when the necessary debug has been enabled. For slab,
build with CONFIG_DEBUG_SLAB=y, and either use sizes with ample space
to the next power of two or use the SLAB_STORE_USER when creating the
kmem_cache structure. For slub, build with CONFIG_SLUB_DEBUG=y and
boot with slub_debug=U, or pass SLAB_STORE_USER to kmem_cache_create()
if more focused use is desired. Also for slub, use CONFIG_STACKTRACE
to enable printing of the allocation-time stack trace.
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>
Reported-by: Andrii Nakryiko <andrii@kernel.org>
[ paulmck: Convert to printing and change names per Joonsoo Kim. ]
[ paulmck: Move slab definition per Stephen Rothwell and kbuild test robot. ]
[ paulmck: Handle CONFIG_MMU=n case where vmalloc() is kmalloc(). ]
[ paulmck: Apply Vlastimil Babka feedback on slab.c kmem_provenance(). ]
[ paulmck: Extract more info from !SLUB_DEBUG per Joonsoo Kim. ]
[ paulmck: Explicitly check for small pointers per Naresh Kamboju. ]
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-12-08 01:41:02 +00:00
|
|
|
*/
|
|
|
|
void mem_dump_obj(void *object)
|
|
|
|
{
|
2021-05-05 01:38:32 +00:00
|
|
|
const char *type;
|
|
|
|
|
2023-08-05 03:17:25 +00:00
|
|
|
if (kmem_dump_obj(object))
|
2020-12-09 00:13:57 +00:00
|
|
|
return;
|
2021-05-05 01:38:32 +00:00
|
|
|
|
2020-12-09 00:13:57 +00:00
|
|
|
if (vmalloc_dump_obj(object))
|
|
|
|
return;
|
2021-05-05 01:38:32 +00:00
|
|
|
|
rcu: dump vmalloc memory info safely
Currently, for double invoke call_rcu(), will dump rcu_head objects memory
info, if the objects is not allocated from the slab allocator, the
vmalloc_dump_obj() will be invoke and the vmap_area_lock spinlock need to
be held, since the call_rcu() can be invoked in interrupt context,
therefore, there is a possibility of spinlock deadlock scenarios.
And in Preempt-RT kernel, the rcutorture test also trigger the following
lockdep warning:
BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1, name: swapper/0
preempt_count: 1, expected: 0
RCU nest depth: 1, expected: 1
3 locks held by swapper/0/1:
#0: ffffffffb534ee80 (fullstop_mutex){+.+.}-{4:4}, at: torture_init_begin+0x24/0xa0
#1: ffffffffb5307940 (rcu_read_lock){....}-{1:3}, at: rcu_torture_init+0x1ec7/0x2370
#2: ffffffffb536af40 (vmap_area_lock){+.+.}-{3:3}, at: find_vmap_area+0x1f/0x70
irq event stamp: 565512
hardirqs last enabled at (565511): [<ffffffffb379b138>] __call_rcu_common+0x218/0x940
hardirqs last disabled at (565512): [<ffffffffb5804262>] rcu_torture_init+0x20b2/0x2370
softirqs last enabled at (399112): [<ffffffffb36b2586>] __local_bh_enable_ip+0x126/0x170
softirqs last disabled at (399106): [<ffffffffb43fef59>] inet_register_protosw+0x9/0x1d0
Preemption disabled at:
[<ffffffffb58040c3>] rcu_torture_init+0x1f13/0x2370
CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 6.5.0-rc4-rt2-yocto-preempt-rt+ #15
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x68/0xb0
dump_stack+0x14/0x20
__might_resched+0x1aa/0x280
? __pfx_rcu_torture_err_cb+0x10/0x10
rt_spin_lock+0x53/0x130
? find_vmap_area+0x1f/0x70
find_vmap_area+0x1f/0x70
vmalloc_dump_obj+0x20/0x60
mem_dump_obj+0x22/0x90
__call_rcu_common+0x5bf/0x940
? debug_smp_processor_id+0x1b/0x30
call_rcu_hurry+0x14/0x20
rcu_torture_init+0x1f82/0x2370
? __pfx_rcu_torture_leak_cb+0x10/0x10
? __pfx_rcu_torture_leak_cb+0x10/0x10
? __pfx_rcu_torture_init+0x10/0x10
do_one_initcall+0x6c/0x300
? debug_smp_processor_id+0x1b/0x30
kernel_init_freeable+0x2b9/0x540
? __pfx_kernel_init+0x10/0x10
kernel_init+0x1f/0x150
ret_from_fork+0x40/0x50
? __pfx_kernel_init+0x10/0x10
ret_from_fork_asm+0x1b/0x30
</TASK>
The previous patch fixes this by using the deadlock-safe best-effort
version of find_vm_area. However, in case of failure print the fact that
the pointer was a vmalloc pointer so that we print at least something.
Link: https://lkml.kernel.org/r/20230904180806.1002832-2-joel@joelfernandes.org
Fixes: 98f180837a89 ("mm: Make mem_dump_obj() handle vmalloc() memory")
Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reported-by: Zhen Lei <thunder.leizhen@huaweicloud.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-09-04 18:08:05 +00:00
|
|
|
if (is_vmalloc_addr(object))
|
|
|
|
type = "vmalloc memory";
|
|
|
|
else if (virt_addr_valid(object))
|
2021-05-05 01:38:32 +00:00
|
|
|
type = "non-slab/vmalloc memory";
|
|
|
|
else if (object == NULL)
|
|
|
|
type = "NULL pointer";
|
|
|
|
else if (object == ZERO_SIZE_PTR)
|
|
|
|
type = "zero-size pointer";
|
|
|
|
else
|
|
|
|
type = "non-paged memory";
|
|
|
|
|
|
|
|
pr_cont(" %s\n", type);
|
mm: Add mem_dump_obj() to print source of memory block
There are kernel facilities such as per-CPU reference counts that give
error messages in generic handlers or callbacks, whose messages are
unenlightening. In the case of per-CPU reference-count underflow, this
is not a problem when creating a new use of this facility because in that
case the bug is almost certainly in the code implementing that new use.
However, trouble arises when deploying across many systems, which might
exercise corner cases that were not seen during development and testing.
Here, it would be really nice to get some kind of hint as to which of
several uses the underflow was caused by.
This commit therefore exposes a mem_dump_obj() function that takes
a pointer to memory (which must still be allocated if it has been
dynamically allocated) and prints available information on where that
memory came from. This pointer can reference the middle of the block as
well as the beginning of the block, as needed by things like RCU callback
functions and timer handlers that might not know where the beginning of
the memory block is. These functions and handlers can use mem_dump_obj()
to print out better hints as to where the problem might lie.
The information printed can depend on kernel configuration. For example,
the allocation return address can be printed only for slab and slub,
and even then only when the necessary debug has been enabled. For slab,
build with CONFIG_DEBUG_SLAB=y, and either use sizes with ample space
to the next power of two or use the SLAB_STORE_USER when creating the
kmem_cache structure. For slub, build with CONFIG_SLUB_DEBUG=y and
boot with slub_debug=U, or pass SLAB_STORE_USER to kmem_cache_create()
if more focused use is desired. Also for slub, use CONFIG_STACKTRACE
to enable printing of the allocation-time stack trace.
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>
Reported-by: Andrii Nakryiko <andrii@kernel.org>
[ paulmck: Convert to printing and change names per Joonsoo Kim. ]
[ paulmck: Move slab definition per Stephen Rothwell and kbuild test robot. ]
[ paulmck: Handle CONFIG_MMU=n case where vmalloc() is kmalloc(). ]
[ paulmck: Apply Vlastimil Babka feedback on slab.c kmem_provenance(). ]
[ paulmck: Extract more info from !SLUB_DEBUG per Joonsoo Kim. ]
[ paulmck: Explicitly check for small pointers per Naresh Kamboju. ]
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-12-08 01:41:02 +00:00
|
|
|
}
|
2020-12-08 05:23:36 +00:00
|
|
|
EXPORT_SYMBOL_GPL(mem_dump_obj);
|
2021-01-07 21:46:11 +00:00
|
|
|
#endif
|
2021-07-01 01:50:14 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A driver might set a page logically offline -- PageOffline() -- and
|
|
|
|
* turn the page inaccessible in the hypervisor; after that, access to page
|
|
|
|
* content can be fatal.
|
|
|
|
*
|
|
|
|
* Some special PFN walkers -- i.e., /proc/kcore -- read content of random
|
|
|
|
* pages after checking PageOffline(); however, these PFN walkers can race
|
|
|
|
* with drivers that set PageOffline().
|
|
|
|
*
|
|
|
|
* page_offline_freeze()/page_offline_thaw() allows for a subsystem to
|
|
|
|
* synchronize with such drivers, achieving that a page cannot be set
|
|
|
|
* PageOffline() while frozen.
|
|
|
|
*
|
|
|
|
* page_offline_begin()/page_offline_end() is used by drivers that care about
|
|
|
|
* such races when setting a page PageOffline().
|
|
|
|
*/
|
|
|
|
static DECLARE_RWSEM(page_offline_rwsem);
|
|
|
|
|
|
|
|
void page_offline_freeze(void)
|
|
|
|
{
|
|
|
|
down_read(&page_offline_rwsem);
|
|
|
|
}
|
|
|
|
|
|
|
|
void page_offline_thaw(void)
|
|
|
|
{
|
|
|
|
up_read(&page_offline_rwsem);
|
|
|
|
}
|
|
|
|
|
|
|
|
void page_offline_begin(void)
|
|
|
|
{
|
|
|
|
down_write(&page_offline_rwsem);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(page_offline_begin);
|
|
|
|
|
|
|
|
void page_offline_end(void)
|
|
|
|
{
|
|
|
|
up_write(&page_offline_rwsem);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(page_offline_end);
|
2020-12-16 16:06:33 +00:00
|
|
|
|
2023-08-02 15:13:33 +00:00
|
|
|
#ifndef flush_dcache_folio
|
2020-12-16 16:06:33 +00:00
|
|
|
void flush_dcache_folio(struct folio *folio)
|
|
|
|
{
|
|
|
|
long i, nr = folio_nr_pages(folio);
|
|
|
|
|
|
|
|
for (i = 0; i < nr; i++)
|
|
|
|
flush_dcache_page(folio_page(folio, i));
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(flush_dcache_folio);
|
|
|
|
#endif
|