2005-04-16 15:20:36 -07:00
|
|
|
#ifndef _LINUX_MM_H
|
|
|
|
#define _LINUX_MM_H
|
|
|
|
|
|
|
|
#include <linux/errno.h>
|
|
|
|
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
|
|
|
|
#include <linux/gfp.h>
|
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/prio_tree.h>
|
|
|
|
#include <linux/fs.h>
|
2006-01-09 15:59:21 -08:00
|
|
|
#include <linux/mutex.h>
|
2006-07-03 00:24:33 -07:00
|
|
|
#include <linux/debug_locks.h>
|
2006-09-25 23:30:57 -07:00
|
|
|
#include <linux/backing-dev.h>
|
2006-09-27 01:50:01 -07:00
|
|
|
#include <linux/mm_types.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
struct mempolicy;
|
|
|
|
struct anon_vma;
|
Detach sched.h from mm.h
First thing mm.h does is including sched.h solely for can_do_mlock() inline
function which has "current" dereference inside. By dealing with can_do_mlock()
mm.h can be detached from sched.h which is good. See below, why.
This patch
a) removes unconditional inclusion of sched.h from mm.h
b) makes can_do_mlock() normal function in mm/mlock.c
c) exports can_do_mlock() to not break compilation
d) adds sched.h inclusions back to files that were getting it indirectly.
e) adds less bloated headers to some files (asm/signal.h, jiffies.h) that were
getting them indirectly
Net result is:
a) mm.h users would get less code to open, read, preprocess, parse, ... if
they don't need sched.h
b) sched.h stops being dependency for significant number of files:
on x86_64 allmodconfig touching sched.h results in recompile of 4083 files,
after patch it's only 3744 (-8.3%).
Cross-compile tested on
all arm defconfigs, all mips defconfigs, all powerpc defconfigs,
alpha alpha-up
arm
i386 i386-up i386-defconfig i386-allnoconfig
ia64 ia64-up
m68k
mips
parisc parisc-up
powerpc powerpc-up
s390 s390-up
sparc sparc-up
sparc64 sparc64-up
um-x86_64
x86_64 x86_64-up x86_64-defconfig x86_64-allnoconfig
as well as my two usual configs.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-21 01:22:52 +04:00
|
|
|
struct user_struct;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
|
|
|
|
extern unsigned long max_mapnr;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern unsigned long num_physpages;
|
|
|
|
extern void * high_memory;
|
|
|
|
extern int page_cluster;
|
|
|
|
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
|
|
extern int sysctl_legacy_va_layout;
|
|
|
|
#else
|
|
|
|
#define sysctl_legacy_va_layout 0
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <asm/page.h>
|
|
|
|
#include <asm/pgtable.h>
|
|
|
|
#include <asm/processor.h>
|
|
|
|
|
|
|
|
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Linux kernel virtual memory manager primitives.
|
|
|
|
* The idea being to have a "virtual" mm in the same way
|
|
|
|
* we have a virtual fs - giving a cleaner interface to the
|
|
|
|
* mm details, and allowing different kinds of memory mappings
|
|
|
|
* (from shared memory to executable loading to arbitrary
|
|
|
|
* mmap() functions).
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This struct defines a VMM memory area. There is one of these
|
|
|
|
* per VM-area/task. A VM area is any part of the process virtual memory
|
|
|
|
* space that has a special rule for the page-fault handlers (ie a shared
|
|
|
|
* library, the executable area etc).
|
|
|
|
*/
|
|
|
|
struct vm_area_struct {
|
|
|
|
struct mm_struct * vm_mm; /* The address space we belong to. */
|
|
|
|
unsigned long vm_start; /* Our start address within vm_mm. */
|
|
|
|
unsigned long vm_end; /* The first byte after our end address
|
|
|
|
within vm_mm. */
|
|
|
|
|
|
|
|
/* linked list of VM areas per task, sorted by address */
|
|
|
|
struct vm_area_struct *vm_next;
|
|
|
|
|
|
|
|
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
|
|
|
|
unsigned long vm_flags; /* Flags, listed below. */
|
|
|
|
|
|
|
|
struct rb_node vm_rb;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For areas with an address space and backing store,
|
|
|
|
* linkage into the address_space->i_mmap prio tree, or
|
|
|
|
* linkage to the list of like vmas hanging off its node, or
|
|
|
|
* linkage of vma in the address_space->i_mmap_nonlinear list.
|
|
|
|
*/
|
|
|
|
union {
|
|
|
|
struct {
|
|
|
|
struct list_head list;
|
|
|
|
void *parent; /* aligns with prio_tree_node parent */
|
|
|
|
struct vm_area_struct *head;
|
|
|
|
} vm_set;
|
|
|
|
|
|
|
|
struct raw_prio_tree_node prio_tree_node;
|
|
|
|
} shared;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
|
|
|
|
* list, after a COW of one of the file pages. A MAP_SHARED vma
|
|
|
|
* can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
|
|
|
|
* or brk vma (with NULL file) can only be in an anon_vma list.
|
|
|
|
*/
|
|
|
|
struct list_head anon_vma_node; /* Serialized by anon_vma->lock */
|
|
|
|
struct anon_vma *anon_vma; /* Serialized by page_table_lock */
|
|
|
|
|
|
|
|
/* Function pointers to deal with this struct. */
|
|
|
|
struct vm_operations_struct * vm_ops;
|
|
|
|
|
|
|
|
/* Information about our backing store: */
|
|
|
|
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
|
|
|
|
units, *not* PAGE_CACHE_SIZE */
|
|
|
|
struct file * vm_file; /* File we map to (can be NULL). */
|
|
|
|
void * vm_private_data; /* was vm_pte (shared mem) */
|
|
|
|
unsigned long vm_truncate_count;/* truncate_count or restart_addr */
|
|
|
|
|
|
|
|
#ifndef CONFIG_MMU
|
|
|
|
atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
2006-12-06 20:32:48 -08:00
|
|
|
extern struct kmem_cache *vm_area_cachep;
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is
|
|
|
|
* disabled, then there's a single shared list of VMAs maintained by the
|
|
|
|
* system, and mm's subscribe to these individually
|
|
|
|
*/
|
|
|
|
struct vm_list_struct {
	struct vm_list_struct *next;	/* next entry in this list */
	struct vm_area_struct *vma;	/* the VMA this entry refers to */
};
|
|
|
|
|
|
|
|
#ifndef CONFIG_MMU
|
|
|
|
extern struct rb_root nommu_vma_tree;
|
|
|
|
extern struct rw_semaphore nommu_vma_sem;
|
|
|
|
|
|
|
|
extern unsigned int kobjsize(const void *objp);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* vm_flags..
|
|
|
|
*/
|
|
|
|
#define VM_READ 0x00000001 /* currently active flags */
|
|
|
|
#define VM_WRITE 0x00000002
|
|
|
|
#define VM_EXEC 0x00000004
|
|
|
|
#define VM_SHARED 0x00000008
|
|
|
|
|
2005-09-21 09:55:39 -07:00
|
|
|
/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
|
2005-04-16 15:20:36 -07:00
|
|
|
#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */
|
|
|
|
#define VM_MAYWRITE 0x00000020
|
|
|
|
#define VM_MAYEXEC 0x00000040
|
|
|
|
#define VM_MAYSHARE 0x00000080
|
|
|
|
|
|
|
|
#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
|
|
|
|
#define VM_GROWSUP 0x00000200
|
2005-11-28 14:34:23 -08:00
|
|
|
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
|
2005-04-16 15:20:36 -07:00
|
|
|
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
|
|
|
|
|
|
|
|
#define VM_EXECUTABLE 0x00001000
|
|
|
|
#define VM_LOCKED 0x00002000
|
|
|
|
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
|
|
|
|
|
|
|
|
/* Used by sys_madvise() */
|
|
|
|
#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
|
|
|
|
#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
|
|
|
|
|
|
|
|
#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
|
|
|
|
#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
|
[PATCH] unpaged: VM_UNPAGED
Although we tend to associate VM_RESERVED with remap_pfn_range, quite a few
drivers set VM_RESERVED on areas which are then populated by nopage. The
PageReserved removal in 2.6.15-rc1 changed VM_RESERVED not to free pages in
zap_pte_range, without changing those drivers not to set it: so their pages
just leak away.
Let's not change miscellaneous drivers now: introduce VM_UNPAGED at the core,
to flag the special areas where the ptes may have no struct page, or if they
have then it's not to be touched. Replace most instances of VM_RESERVED in
core mm by VM_UNPAGED. Force it on in remap_pfn_range, and the sparc and
sparc64 io_remap_pfn_range.
Revert addition of VM_RESERVED to powerpc vdso, it's not needed there. Is it
needed anywhere? It still governs the mm->reserved_vm statistic, and special
vmas not to be merged, and areas not to be core dumped; but could probably be
eliminated later (the drivers are probably specifying it because in 2.4 it
kept swapout off the vma, but in 2.6 we work from the LRU, which these pages
don't get on).
Use the VM_SHM slot for VM_UNPAGED, and define VM_SHM to 0: it serves no
purpose whatsoever, and should be removed from drivers when we clean up.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: William Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-21 21:32:15 -08:00
|
|
|
#define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */
|
2005-04-16 15:20:36 -07:00
|
|
|
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
|
|
|
|
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
|
|
|
|
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
|
|
|
|
#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
|
2005-12-16 10:21:23 -08:00
|
|
|
#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
|
2007-01-26 00:56:48 -08:00
|
|
|
#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 01:46:57 -07:00
|
|
|
|
2007-07-19 01:47:03 -07:00
|
|
|
#define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
|
|
|
|
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
|
|
|
|
#else
|
|
|
|
#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * Helpers for the two read-hint bits in vma->vm_flags (VM_SEQ_READ and
 * VM_RAND_READ).  Each macro takes a pointer to an object that has a
 * vm_flags member.
 *
 * VM_ClearReadHint() clears both hint bits in place.  Its expansion is
 * fully parenthesised so that it behaves as a single expression wherever
 * it is used (e.g. "VM_ClearReadHint(v) == x" compares the updated flags
 * rather than folding the comparison into the mask).
 *
 * VM_NormalReadHint() is true when neither hint bit is set;
 * VM_SequentialReadHint() / VM_RandomReadHint() are non-zero when the
 * corresponding bit is set.
 */
#define VM_READHINTMASK			(VM_SEQ_READ | VM_RAND_READ)
#define VM_ClearReadHint(v)		((v)->vm_flags &= ~VM_READHINTMASK)
#define VM_NormalReadHint(v)		(!((v)->vm_flags & VM_READHINTMASK))
#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* mapping from the currently active vm_flags protection bits (the
|
|
|
|
* low four bits) to a page protection mask..
|
|
|
|
*/
|
|
|
|
extern pgprot_t protection_map[16];
|
|
|
|
|
2007-07-19 01:47:03 -07:00
|
|
|
#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
|
|
|
|
#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
|
|
|
|
|
|
|
|
|
2007-07-19 01:46:59 -07:00
|
|
|
/*
|
2007-07-19 01:47:03 -07:00
|
|
|
* vm_fault is filled by the pagefault handler and passed to the vma's
|
2007-07-19 01:47:05 -07:00
|
|
|
* ->fault function. The vma's ->fault is responsible for returning a bitmask
|
|
|
|
* of VM_FAULT_xxx flags that give details about how the fault was handled.
|
2007-07-19 01:46:59 -07:00
|
|
|
*
|
2007-07-19 01:47:03 -07:00
|
|
|
* pgoff should be used in favour of virtual_address, if possible. If pgoff
|
|
|
|
* is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear
|
|
|
|
* mapping support.
|
2007-07-19 01:46:59 -07:00
|
|
|
*/
|
2007-07-19 01:47:03 -07:00
|
|
|
struct vm_fault {
|
|
|
|
unsigned int flags; /* FAULT_FLAG_xxx flags */
|
|
|
|
pgoff_t pgoff; /* Logical page offset based on vma */
|
|
|
|
void __user *virtual_address; /* Faulting virtual address */
|
|
|
|
|
|
|
|
struct page *page; /* ->fault handlers should return a
|
2007-07-19 01:47:05 -07:00
|
|
|
* page here, unless VM_FAULT_NOPAGE
|
2007-07-19 01:47:03 -07:00
|
|
|
* is set (which is also implied by
|
2007-07-19 01:47:05 -07:00
|
|
|
* VM_FAULT_ERROR).
|
2007-07-19 01:47:03 -07:00
|
|
|
*/
|
2007-07-19 01:46:59 -07:00
|
|
|
};
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* These are the virtual MM functions - opening of an area, closing and
|
|
|
|
* unmapping it (needed to keep files on disk up-to-date etc), pointer
|
|
|
|
* to the functions called when a no-page or a wp-page exception occurs.
|
|
|
|
*/
|
|
|
|
struct vm_operations_struct {
	/* Called when an area is opened / closed. */
	void (*open)(struct vm_area_struct *area);
	void (*close)(struct vm_area_struct *area);

	/* Page-fault handlers; ->fault receives a filled-in struct vm_fault. */
	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
	struct page *(*nopage)(struct vm_area_struct *area,
			       unsigned long address, int *type);
	unsigned long (*nopfn)(struct vm_area_struct *area,
			       unsigned long address);

	/*
	 * Notification that a previously read-only page is about to become
	 * writable; if an error is returned it will cause a SIGBUS.
	 */
	int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);

#ifdef CONFIG_NUMA
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr);
	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
		       const nodemask_t *to, unsigned long flags);
#endif
};
|
|
|
|
|
|
|
|
struct mmu_gather;
|
|
|
|
struct inode;
|
|
|
|
|
2006-01-08 01:04:36 -08:00
|
|
|
#define page_private(page) ((page)->private)
|
|
|
|
#define set_page_private(page, v) ((page)->private = (v))
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* FIXME: take this include out, include page-flags.h in
|
|
|
|
* files which need it (119 of them)
|
|
|
|
*/
|
|
|
|
#include <linux/page-flags.h>
|
|
|
|
|
2006-09-25 23:30:55 -07:00
|
|
|
/*
 * VM_BUG_ON() - assertion that is only active with CONFIG_DEBUG_VM.
 *
 * With CONFIG_DEBUG_VM it is BUG_ON(cond); otherwise it expands to an
 * empty statement and @cond is not evaluated at all, so @cond must not
 * have side effects the caller relies on.
 */
#ifndef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond) do { } while (0)
#else
#define VM_BUG_ON(cond) BUG_ON(cond)
#endif
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Methods to modify the page usage count.
|
|
|
|
*
|
|
|
|
* What counts for a page usage:
|
|
|
|
* - cache mapping (page->mapping)
|
|
|
|
* - private data (page->private)
|
|
|
|
* - page mapped in a task's page tables, each mapping
|
|
|
|
* is counted separately
|
|
|
|
*
|
|
|
|
* Also, many kernel routines increase the page count before a critical
|
|
|
|
* routine so they can be sure the page doesn't go away from under them.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2006-09-25 23:31:35 -07:00
|
|
|
* Drop a ref, return true if the refcount fell to zero (the page has no users)
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2006-03-22 00:08:03 -08:00
|
|
|
static inline int put_page_testzero(struct page *page)
|
|
|
|
{
|
2006-09-25 23:30:55 -07:00
|
|
|
VM_BUG_ON(atomic_read(&page->_count) == 0);
|
2006-03-22 00:08:03 -08:00
|
|
|
return atomic_dec_and_test(&page->_count);
|
2006-03-22 00:08:03 -08:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
2006-03-22 00:08:03 -08:00
|
|
|
* Try to grab a ref unless the page has a refcount of zero, return false if
|
|
|
|
* that is the case.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2006-03-22 00:08:03 -08:00
|
|
|
static inline int get_page_unless_zero(struct page *page)
|
|
|
|
{
|
2006-09-25 23:30:55 -07:00
|
|
|
VM_BUG_ON(PageCompound(page));
|
2006-03-22 00:08:03 -08:00
|
|
|
return atomic_inc_not_zero(&page->_count);
|
2006-03-22 00:08:03 -08:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-05-06 14:49:39 -07:00
|
|
|
static inline struct page *compound_head(struct page *page)
|
|
|
|
{
|
2007-05-06 14:49:40 -07:00
|
|
|
if (unlikely(PageTail(page)))
|
2007-05-06 14:49:39 -07:00
|
|
|
return page->first_page;
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
|
|
|
static inline int page_count(struct page *page)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-05-06 14:49:39 -07:00
|
|
|
return atomic_read(&compound_head(page)->_count);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void get_page(struct page *page)
|
|
|
|
{
|
2007-05-06 14:49:39 -07:00
|
|
|
page = compound_head(page);
|
2006-09-25 23:30:55 -07:00
|
|
|
VM_BUG_ON(atomic_read(&page->_count) == 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
atomic_inc(&page->_count);
|
|
|
|
}
|
|
|
|
|
2007-05-06 14:49:41 -07:00
|
|
|
/*
 * Translate the address @x with virt_to_page() and return the result of
 * compound_head() on that page.
 */
static inline struct page *virt_to_head_page(const void *x)
{
	return compound_head(virt_to_page(x));
}
|
|
|
|
|
2006-03-22 00:08:40 -08:00
|
|
|
/*
|
|
|
|
* Setup the page count before being freed into the page allocator for
|
|
|
|
* the first time (boot or memory hotplug)
|
|
|
|
*/
|
|
|
|
static inline void init_page_count(struct page *page)
|
|
|
|
{
|
|
|
|
atomic_set(&page->_count, 1);
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
void put_page(struct page *page);
|
2006-08-13 23:24:27 -07:00
|
|
|
void put_pages_list(struct list_head *pages);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2006-03-22 00:08:05 -08:00
|
|
|
void split_page(struct page *page, unsigned int order);
|
|
|
|
|
2006-12-06 20:33:32 -08:00
|
|
|
/*
|
|
|
|
* Compound pages have a destructor function. Provide a
|
|
|
|
* prototype for that function and accessor functions.
|
|
|
|
* These are _only_ valid on the head of a PG_compound page.
|
|
|
|
*/
|
|
|
|
typedef void compound_page_dtor(struct page *);
|
|
|
|
|
|
|
|
static inline void set_compound_page_dtor(struct page *page,
|
|
|
|
compound_page_dtor *dtor)
|
|
|
|
{
|
|
|
|
page[1].lru.next = (void *)dtor;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
|
|
|
|
{
|
|
|
|
return (compound_page_dtor *)page[1].lru.next;
|
|
|
|
}
|
|
|
|
|
2007-05-06 14:49:39 -07:00
|
|
|
static inline int compound_order(struct page *page)
|
|
|
|
{
|
2007-05-06 14:49:40 -07:00
|
|
|
if (!PageHead(page))
|
2007-05-06 14:49:39 -07:00
|
|
|
return 0;
|
|
|
|
return (unsigned long)page[1].lru.prev;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_compound_order(struct page *page, unsigned long order)
|
|
|
|
{
|
|
|
|
page[1].lru.prev = (void *)order;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Multiple processes may "see" the same page. E.g. for untouched
|
|
|
|
* mappings of /dev/null, all processes see the same page full of
|
|
|
|
* zeroes, and text pages of executables and shared libraries have
|
|
|
|
* only one copy in memory, at most, normally.
|
|
|
|
*
|
|
|
|
* For the non-reserved pages, page_count(page) denotes a reference count.
|
2005-09-21 09:55:38 -07:00
|
|
|
* page_count() == 0 means the page is free. page->lru is then used for
|
|
|
|
* freelist management in the buddy allocator.
|
2006-09-25 23:31:35 -07:00
|
|
|
* page_count() > 0 means the page has been allocated.
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
2006-09-25 23:31:35 -07:00
|
|
|
* Pages are allocated by the slab allocator in order to provide memory
|
|
|
|
* to kmalloc and kmem_cache_alloc. In this case, the management of the
|
|
|
|
* page, and the fields in 'struct page' are the responsibility of mm/slab.c
|
|
|
|
* unless a particular usage is carefully commented. (the responsibility of
|
|
|
|
* freeing the kmalloc memory is the caller's, of course).
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
2006-09-25 23:31:35 -07:00
|
|
|
* A page may be used by anyone else who does a __get_free_page().
|
|
|
|
* In this case, page_count still tracks the references, and should only
|
|
|
|
* be used through the normal accessor functions. The top bits of page->flags
|
|
|
|
* and page->virtual store page management information, but all other fields
|
|
|
|
* are unused and could be used privately, carefully. The management of this
|
|
|
|
* page is the responsibility of the one who allocated it, and those who have
|
|
|
|
* subsequently been given references to it.
|
|
|
|
*
|
|
|
|
* The other pages (we may call them "pagecache pages") are completely
|
2005-04-16 15:20:36 -07:00
|
|
|
* managed by the Linux memory manager: I/O, buffers, swapping etc.
|
|
|
|
* The following discussion applies only to them.
|
|
|
|
*
|
2006-09-25 23:31:35 -07:00
|
|
|
* A pagecache page contains an opaque `private' member, which belongs to the
|
|
|
|
* page's address_space. Usually, this is the address of a circular list of
|
|
|
|
* the page's disk buffers. PG_private must be set to tell the VM to call
|
|
|
|
* into the filesystem to release these pages.
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
2006-09-25 23:31:35 -07:00
|
|
|
* A page may belong to an inode's memory mapping. In this case, page->mapping
|
|
|
|
* is the pointer to the inode, and page->index is the file offset of the page,
|
|
|
|
* in units of PAGE_CACHE_SIZE.
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
2006-09-25 23:31:35 -07:00
|
|
|
* If pagecache pages are not associated with an inode, they are said to be
|
|
|
|
* anonymous pages. These may become associated with the swapcache, and in that
|
|
|
|
* case PG_swapcache is set, and page->private is an offset into the swapcache.
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
2006-09-25 23:31:35 -07:00
|
|
|
* In either case (swapcache or inode backed), the pagecache itself holds one
|
|
|
|
* reference to the page. Setting PG_private should also increment the
|
|
|
|
* refcount. Each user mapping also has a reference to the page.
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
2006-09-25 23:31:35 -07:00
|
|
|
* The pagecache pages are stored in a per-mapping radix tree, which is
|
|
|
|
* rooted at mapping->page_tree, and indexed by offset.
|
|
|
|
* Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
|
|
|
|
* lists, we instead now tag pages as dirty/writeback in the radix tree.
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
2006-09-25 23:31:35 -07:00
|
|
|
* All pagecache pages may be subject to I/O:
|
2005-04-16 15:20:36 -07:00
|
|
|
* - inode pages may need to be read from disk,
|
|
|
|
* - inode pages which have been modified and are MAP_SHARED may need
|
2006-09-25 23:31:35 -07:00
|
|
|
* to be written back to the inode on disk,
|
|
|
|
* - anonymous pages (including MAP_PRIVATE file mappings) which have been
|
|
|
|
* modified may need to be swapped out to swap space and (later) to be read
|
|
|
|
* back into memory.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The zone field is never updated after free_area_init_core()
|
|
|
|
* sets it, so none of the operations on it need to be atomic.
|
|
|
|
*/
|
2005-06-23 00:07:40 -07:00
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* page->flags layout:
|
|
|
|
*
|
|
|
|
* There are three possibilities for how page->flags get
|
|
|
|
* laid out. The first is for the normal case, without
|
|
|
|
* sparsemem. The second is for sparsemem when there is
|
|
|
|
* plenty of space for node and section. The last is when
|
|
|
|
* we have run out of space and have to fall back to an
|
|
|
|
* alternate (slower) way of determining the node.
|
|
|
|
*
|
|
|
|
* No sparsemem: | NODE | ZONE | ... | FLAGS |
|
|
|
|
* with space for node: | SECTION | NODE | ZONE | ... | FLAGS |
|
|
|
|
* no space for node: | SECTION | ZONE | ... | FLAGS |
|
|
|
|
*/
|
|
|
|
/*
 * SECTIONS_WIDTH: number of page->flags bits given to the section field.
 * It is SECTIONS_SHIFT when CONFIG_SPARSEMEM is set and 0 otherwise.
 */
#ifndef CONFIG_SPARSEMEM
# define SECTIONS_WIDTH		0
#else
# define SECTIONS_WIDTH		SECTIONS_SHIFT
#endif
|
|
|
|
|
|
|
|
/* ZONES_WIDTH: width, in bits, of the zone field in page->flags (always ZONES_SHIFT). */
#define ZONES_WIDTH ZONES_SHIFT
|
|
|
|
|
|
|
|
/*
 * NODES_WIDTH: the node field gets NODES_SHIFT bits only when the section,
 * zone and node fields together fit within FLAGS_RESERVED bits; otherwise
 * it gets no bits at all.
 */
#if (SECTIONS_WIDTH + ZONES_WIDTH + NODES_SHIFT) > FLAGS_RESERVED
# define NODES_WIDTH		0
#else
# define NODES_WIDTH		NODES_SHIFT
#endif
|
|
|
|
|
|
|
|
/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
|
2005-11-05 17:25:53 +01:00
|
|
|
/* Bit offset of the section field: the bit count of an unsigned long minus SECTIONS_WIDTH. */
#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
/*
 * The node field starts NODES_WIDTH bits below the section field, and the
 * zone field starts ZONES_WIDTH bits below the node field.
 */
#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
|
|
|
|
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are going to use the flags for the page to node mapping if it's in
|
|
|
|
* there. This includes the case where there is no node, so it is implicit.
|
|
|
|
*/
|
2006-12-06 20:31:45 -08:00
|
|
|
/*
 * NODE_NOT_IN_PAGE_FLAGS is defined when NODES_SHIFT is non-zero but the
 * node field received no bits in page->flags (NODES_WIDTH is not positive).
 */
#if NODES_WIDTH <= 0 && NODES_SHIFT != 0
# define NODE_NOT_IN_PAGE_FLAGS
#endif
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
|
|
|
|
/* Fall back to a PFN_SECTION_SHIFT of 0 when nothing has defined it yet. */
#if !defined(PFN_SECTION_SHIFT)
# define PFN_SECTION_SHIFT	0
#endif
|
2005-06-23 00:07:40 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Define the bit shifts to access each section. For non-existent
|
|
|
|
* sections we define the shift as 0; that plus a 0 mask ensures
|
|
|
|
* the compiler will optimise away reference to them.
|
|
|
|
*/
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
/*
 * Right-shift amounts for each field: the field's bit offset, multiplied by
 * 0 when the field has zero width so that the shift collapses to 0.
 */
#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
|
|
|
|
#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
|
|
|
|
#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
|
2005-06-23 00:07:40 -07:00
|
|
|
|
2006-12-06 20:31:45 -08:00
|
|
|
/*
 * NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator.
 *
 * When the node id is not kept in page->flags (NODE_NOT_IN_PAGE_FLAGS is
 * defined), the section field takes the node's place in the identifier.
 * The guard below must use exactly that macro name: a misspelt name is
 * silently treated as undefined and the SECTION:ZONE variant would never
 * be selected.
 *
 * ZONEID_SHIFT is the width of the identifier in bits and ZONEID_PGOFF is
 * the lower of the two field offsets, i.e. where the identifier starts.
 */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF		((SECTIONS_PGOFF < ZONES_PGOFF)? \
						SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF		((NODES_PGOFF < ZONES_PGOFF)? \
						NODES_PGOFF : ZONES_PGOFF)
#endif

/* Right-shift amount for the zone id; collapses to 0 when ZONEID_SHIFT is 0. */
#define ZONEID_PGSHIFT		(ZONEID_PGOFF * (ZONEID_SHIFT != 0))
|
2005-06-23 00:07:40 -07:00
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
/* Build-time guard: the section, node and zone fields must fit in FLAGS_RESERVED bits. */
#if (SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH) > FLAGS_RESERVED
# error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
#endif
|
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
/*
 * Field masks, right-aligned at bit 0: each has as many low bits set as the
 * corresponding width, so a zero-width field yields a mask of 0.
 * ZONEID_MASK uses ZONEID_SHIFT as its width.
 */
#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
|
|
|
|
#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
|
|
|
|
#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
|
2006-12-06 20:31:45 -08:00
|
|
|
#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
|
2005-06-23 00:07:40 -07:00
|
|
|
|
2006-09-25 23:31:13 -07:00
|
|
|
static inline enum zone_type page_zonenum(struct page *page)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2005-06-23 00:07:40 -07:00
|
|
|
return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2006-12-06 20:31:45 -08:00
|
|
|
/*
|
|
|
|
* The identification function is only used by the buddy allocator for
|
|
|
|
* determining if two pages could be buddies. We are not really
|
|
|
|
* identifying a zone since we could be using a the section number
|
|
|
|
* id if we have not node id available in page flags.
|
|
|
|
* We guarantee only that it will return the same value for two
|
|
|
|
* combinable pages in a zone.
|
|
|
|
*/
|
2006-06-23 02:03:01 -07:00
|
|
|
static inline int page_zone_id(struct page *page)
|
|
|
|
{
|
2006-12-06 20:31:45 -08:00
|
|
|
return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
|
2005-06-23 00:07:40 -07:00
|
|
|
}
|
|
|
|
|
2006-12-06 20:33:03 -08:00
|
|
|
/*
 * zone_to_nid - node id recorded in @zone.
 *
 * Returns zone->node when CONFIG_NUMA is enabled and 0 otherwise.
 */
static inline int zone_to_nid(struct zone *zone)
{
#ifndef CONFIG_NUMA
	return 0;
#else
	return zone->node;
#endif
}
|
|
|
|
|
2006-12-06 20:31:45 -08:00
|
|
|
#ifdef NODE_NOT_IN_PAGE_FLAGS
|
2006-12-06 20:33:03 -08:00
|
|
|
extern int page_to_nid(struct page *page);
|
2006-12-06 20:31:45 -08:00
|
|
|
#else
|
2006-12-06 20:33:03 -08:00
|
|
|
static inline int page_to_nid(struct page *page)
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
{
|
2006-12-06 20:31:45 -08:00
|
|
|
return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
}
|
2006-12-06 20:31:45 -08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
static inline struct zone *page_zone(struct page *page)
|
|
|
|
{
|
|
|
|
return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
|
|
|
|
}
|
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
static inline unsigned long page_to_section(struct page *page)
|
|
|
|
{
|
|
|
|
return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
|
|
|
|
}
|
|
|
|
|
2006-09-25 23:31:13 -07:00
|
|
|
static inline void set_page_zone(struct page *page, enum zone_type zone)
|
2005-06-23 00:07:40 -07:00
|
|
|
{
|
|
|
|
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
|
|
|
|
page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
|
|
|
|
}
|
2006-09-25 23:31:13 -07:00
|
|
|
|
2005-06-23 00:07:40 -07:00
|
|
|
static inline void set_page_node(struct page *page, unsigned long node)
|
|
|
|
{
|
|
|
|
page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
|
|
|
|
page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2006-12-06 20:31:45 -08:00
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
static inline void set_page_section(struct page *page, unsigned long section)
|
|
|
|
{
|
|
|
|
page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
|
|
|
|
page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2006-09-25 23:31:13 -07:00
|
|
|
static inline void set_page_links(struct page *page, enum zone_type zone,
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
unsigned long node, unsigned long pfn)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2005-06-23 00:07:40 -07:00
|
|
|
set_page_zone(page, zone);
|
|
|
|
set_page_node(page, node);
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:07:54 -07:00
|
|
|
set_page_section(page, pfn_to_section_nr(pfn));
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2006-06-30 01:55:32 -07:00
|
|
|
/*
|
|
|
|
* Some inline functions in vmstat.h depend on page_zone()
|
|
|
|
*/
|
|
|
|
#include <linux/vmstat.h>
|
|
|
|
|
2006-01-14 13:21:30 -08:00
|
|
|
static __always_inline void *lowmem_page_address(struct page *page)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
return __va(page_to_pfn(page) << PAGE_SHIFT);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
|
|
|
|
#define HASHED_PAGE_VIRTUAL
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(WANT_PAGE_VIRTUAL)
|
|
|
|
#define page_address(page) ((page)->virtual)
|
|
|
|
#define set_page_address(page, address) \
|
|
|
|
do { \
|
|
|
|
(page)->virtual = (address); \
|
|
|
|
} while(0)
|
|
|
|
#define page_address_init() do { } while(0)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(HASHED_PAGE_VIRTUAL)
|
|
|
|
void *page_address(struct page *page);
|
|
|
|
void set_page_address(struct page *page, void *virtual);
|
|
|
|
void page_address_init(void);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
|
|
|
|
#define page_address(page) lowmem_page_address(page)
|
|
|
|
#define set_page_address(page, address) do { } while(0)
|
|
|
|
#define page_address_init() do { } while(0)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* On an anonymous page mapped into a user virtual memory area,
|
|
|
|
* page->mapping points to its anon_vma, not to a struct address_space;
|
|
|
|
* with the PAGE_MAPPING_ANON bit set to distinguish it.
|
|
|
|
*
|
|
|
|
* Please note that, confusingly, "page_mapping" refers to the inode
|
|
|
|
* address_space which maps the page from disk; whereas "page_mapped"
|
|
|
|
* refers to user virtual address space into which the page is mapped.
|
|
|
|
*/
|
|
|
|
#define PAGE_MAPPING_ANON 1
|
|
|
|
|
|
|
|
extern struct address_space swapper_space;
|
|
|
|
static inline struct address_space *page_mapping(struct page *page)
|
|
|
|
{
|
|
|
|
struct address_space *mapping = page->mapping;
|
|
|
|
|
2007-07-17 04:03:33 -07:00
|
|
|
VM_BUG_ON(PageSlab(page));
|
2005-04-16 15:20:36 -07:00
|
|
|
if (unlikely(PageSwapCache(page)))
|
|
|
|
mapping = &swapper_space;
|
2007-06-21 23:27:45 +01:00
|
|
|
#ifdef CONFIG_SLUB
|
|
|
|
else if (unlikely(PageSlab(page)))
|
|
|
|
mapping = NULL;
|
|
|
|
#endif
|
2005-04-16 15:20:36 -07:00
|
|
|
else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
|
|
|
|
mapping = NULL;
|
|
|
|
return mapping;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int PageAnon(struct page *page)
|
|
|
|
{
|
|
|
|
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the pagecache index of the passed page. Regular pagecache pages
|
|
|
|
* use ->index whereas swapcache pages use ->private
|
|
|
|
*/
|
|
|
|
static inline pgoff_t page_index(struct page *page)
|
|
|
|
{
|
|
|
|
if (unlikely(PageSwapCache(page)))
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
|
|
|
return page_private(page);
|
2005-04-16 15:20:36 -07:00
|
|
|
return page->index;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The atomic page->_mapcount, like _count, starts from -1:
|
|
|
|
* so that transitions both from it and to it can be tracked,
|
|
|
|
* using atomic_inc_and_test and atomic_add_negative(-1).
|
|
|
|
*/
|
|
|
|
static inline void reset_page_mapcount(struct page *page)
|
|
|
|
{
|
|
|
|
atomic_set(&(page)->_mapcount, -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int page_mapcount(struct page *page)
|
|
|
|
{
|
|
|
|
return atomic_read(&(page)->_mapcount) + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return true if this page is mapped into pagetables.
|
|
|
|
*/
|
|
|
|
static inline int page_mapped(struct page *page)
|
|
|
|
{
|
|
|
|
return atomic_read(&(page)->_mapcount) >= 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Error return values for the *_nopage functions
|
|
|
|
*/
|
|
|
|
#define NOPAGE_SIGBUS (NULL)
|
|
|
|
#define NOPAGE_OOM ((struct page *) (-1))
|
|
|
|
|
2006-09-27 01:50:10 -07:00
|
|
|
/*
|
|
|
|
* Error return values for the *_nopfn functions
|
|
|
|
*/
|
|
|
|
#define NOPFN_SIGBUS ((unsigned long) -1)
|
|
|
|
#define NOPFN_OOM ((unsigned long) -2)
|
2007-02-12 00:51:38 -08:00
|
|
|
#define NOPFN_REFAULT ((unsigned long) -3)
|
2006-09-27 01:50:10 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Different kinds of faults, as returned by handle_mm_fault().
|
|
|
|
* Used to decide whether a process gets delivered SIGBUS or
|
|
|
|
* just gets major/minor fault counters bumped up.
|
|
|
|
*/
|
2007-07-19 01:47:03 -07:00
|
|
|
|
2007-07-19 01:47:05 -07:00
|
|
|
#define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
|
2007-07-19 01:47:03 -07:00
|
|
|
|
2007-07-19 01:47:05 -07:00
|
|
|
#define VM_FAULT_OOM 0x0001
|
|
|
|
#define VM_FAULT_SIGBUS 0x0002
|
|
|
|
#define VM_FAULT_MAJOR 0x0004
|
|
|
|
#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 20:24:01 +10:00
|
|
|
|
2007-07-19 01:47:05 -07:00
|
|
|
#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
|
|
|
|
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-07-19 01:47:05 -07:00
|
|
|
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS)
|
2007-07-19 01:47:03 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
|
|
|
|
|
|
|
|
extern void show_free_areas(void);
|
|
|
|
|
|
|
|
#ifdef CONFIG_SHMEM
|
|
|
|
int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new);
|
|
|
|
struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr);
|
|
|
|
int shmem_lock(struct file *file, int lock, struct user_struct *user);
|
|
|
|
#else
|
2006-01-06 00:10:52 -08:00
|
|
|
static inline int shmem_lock(struct file *file, int lock,
|
|
|
|
struct user_struct *user)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int shmem_set_policy(struct vm_area_struct *vma,
|
|
|
|
struct mempolicy *new)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
|
|
|
struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
|
|
|
|
|
|
|
|
int shmem_zero_setup(struct vm_area_struct *);
|
|
|
|
|
2006-01-06 00:11:42 -08:00
|
|
|
#ifndef CONFIG_MMU
|
|
|
|
extern unsigned long shmem_get_unmapped_area(struct file *file,
|
|
|
|
unsigned long addr,
|
|
|
|
unsigned long len,
|
|
|
|
unsigned long pgoff,
|
|
|
|
unsigned long flags);
|
|
|
|
#endif
|
|
|
|
|
Detach sched.h from mm.h
First thing mm.h does is including sched.h solely for can_do_mlock() inline
function which has "current" dereference inside. By dealing with can_do_mlock()
mm.h can be detached from sched.h which is good. See below, why.
This patch
a) removes unconditional inclusion of sched.h from mm.h
b) makes can_do_mlock() normal function in mm/mlock.c
c) exports can_do_mlock() to not break compilation
d) adds sched.h inclusions back to files that were getting it indirectly.
e) adds less bloated headers to some files (asm/signal.h, jiffies.h) that were
getting them indirectly
Net result is:
a) mm.h users would get less code to open, read, preprocess, parse, ... if
they don't need sched.h
b) sched.h stops being dependency for significant number of files:
on x86_64 allmodconfig touching sched.h results in recompile of 4083 files,
after patch it's only 3744 (-8.3%).
Cross-compile tested on
all arm defconfigs, all mips defconfigs, all powerpc defconfigs,
alpha alpha-up
arm
i386 i386-up i386-defconfig i386-allnoconfig
ia64 ia64-up
m68k
mips
parisc parisc-up
powerpc powerpc-up
s390 s390-up
sparc sparc-up
sparc64 sparc64-up
um-x86_64
x86_64 x86_64-up x86_64-defconfig x86_64-allnoconfig
as well as my two usual configs.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-21 01:22:52 +04:00
|
|
|
extern int can_do_mlock(void);
|
2005-04-16 15:20:36 -07:00
|
|
|
extern int user_shm_lock(size_t, struct user_struct *);
|
|
|
|
extern void user_shm_unlock(size_t, struct user_struct *);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parameter block passed down to zap_pte_range in exceptional cases.
|
|
|
|
*/
|
|
|
|
struct zap_details {
|
|
|
|
struct vm_area_struct *nonlinear_vma; /* Check page->index if set */
|
|
|
|
struct address_space *check_mapping; /* Check page->mapping if set */
|
|
|
|
pgoff_t first_index; /* Lowest page->index to unmap */
|
|
|
|
pgoff_t last_index; /* Highest page->index to unmap */
|
|
|
|
spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */
|
|
|
|
unsigned long truncate_count; /* Compare vm_truncate_count */
|
|
|
|
};
|
|
|
|
|
2005-11-28 14:34:23 -08:00
|
|
|
struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
|
2005-04-19 13:29:15 -07:00
|
|
|
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
|
2005-04-16 15:20:36 -07:00
|
|
|
unsigned long size, struct zap_details *);
|
2005-10-29 18:16:30 -07:00
|
|
|
unsigned long unmap_vmas(struct mmu_gather **tlb,
|
2005-04-16 15:20:36 -07:00
|
|
|
struct vm_area_struct *start_vma, unsigned long start_addr,
|
|
|
|
unsigned long end_addr, unsigned long *nr_accounted,
|
|
|
|
struct zap_details *);
|
2005-04-19 13:29:16 -07:00
|
|
|
void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor, unsigned long ceiling);
|
|
|
|
void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-19 13:29:15 -07:00
|
|
|
unsigned long floor, unsigned long ceiling);
|
2005-04-16 15:20:36 -07:00
|
|
|
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
|
|
|
|
struct vm_area_struct *vma);
|
|
|
|
int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
|
|
|
|
unsigned long size, pgprot_t prot);
|
|
|
|
void unmap_mapping_range(struct address_space *mapping,
|
|
|
|
loff_t const holebegin, loff_t const holelen, int even_cows);
|
|
|
|
|
|
|
|
static inline void unmap_shared_mapping_range(struct address_space *mapping,
|
|
|
|
loff_t const holebegin, loff_t const holelen)
|
|
|
|
{
|
|
|
|
unmap_mapping_range(mapping, holebegin, holelen, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
extern int vmtruncate(struct inode * inode, loff_t offset);
|
[PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing store
Here is the patch to implement madvise(MADV_REMOVE) - which frees up a
given range of pages & its associated backing store. Current
implementation supports only shmfs/tmpfs and other filesystems return
-ENOSYS.
"Some app allocates large tmpfs files, then when some task quits and some
client disconnect, some memory can be released. However the only way to
release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli
Databases want to use this feature to drop a section of their bufferpool
(shared memory segments) - without writing back to disk/swap space.
This feature is also useful for supporting hot-plug memory on UML.
Concerns raised by Andrew Morton:
- "We have no plan for holepunching! If we _do_ have such a plan (or
might in the future) then what would the API look like? I think
sys_holepunch(fd, start, len), so we should start out with that."
- Using madvise is very weird, because people will ask "why do I need to
mmap my file before I can stick a hole in it?"
- None of the other madvise operations call into the filesystem in this
manner. A broad question is: is this capability an MM operation or a
filesystem operation? truncate, for example, is a filesystem operation
which sometimes has MM side-effects. madvise is an mm operation and with
this patch, it gains FS side-effects, only they're really, really
significant ones."
Comments:
- Andrea suggested the fs operation too but then it's more efficient to
have it as a mm operation with fs side effects, because they don't
immediately know fd and physical offset of the range. It's possible to
fixup in userland and to use the fs operation but it's more expensive,
the vmas are already in the kernel and we can use them.
Short term plan & Future Direction:
- We seem to need this interface only for shmfs/tmpfs files in the short
term. We have to add hooks into the filesystem for correctness and
completeness. This is what this patch does.
- In the future, plan is to support both fs and mmap apis also. This
also involves (other) filesystem specific functions to be implemented.
- Current patch doesn't support VM_NONLINEAR - which can be addressed in
the future.
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andrea Arcangeli <andrea@suse.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-06 00:10:38 -08:00
|
|
|
extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 20:24:01 +10:00
|
|
|
|
2006-01-06 00:11:44 -08:00
|
|
|
#ifdef CONFIG_MMU
|
2007-07-19 01:47:05 -07:00
|
|
|
extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
2006-01-06 00:11:44 -08:00
|
|
|
unsigned long address, int write_access);
|
|
|
|
#else
|
|
|
|
static inline int handle_mm_fault(struct mm_struct *mm,
|
|
|
|
struct vm_area_struct *vma, unsigned long address,
|
|
|
|
int write_access)
|
|
|
|
{
|
|
|
|
/* should never happen if there's no MMU */
|
|
|
|
BUG();
|
|
|
|
return VM_FAULT_SIGBUS;
|
|
|
|
}
|
|
|
|
#endif
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 20:24:01 +10:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
extern int make_pages_present(unsigned long addr, unsigned long end);
|
|
|
|
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
|
|
|
|
|
|
|
|
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
|
|
|
|
int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
|
2005-10-29 18:16:12 -07:00
|
|
|
void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2006-08-29 19:05:54 +01:00
|
|
|
extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
|
|
|
|
extern void do_invalidatepage(struct page *page, unsigned long offset);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
int __set_page_dirty_nobuffers(struct page *page);
|
2007-02-10 01:43:15 -08:00
|
|
|
int __set_page_dirty_no_writeback(struct page *page);
|
2005-04-16 15:20:36 -07:00
|
|
|
int redirty_page_for_writepage(struct writeback_control *wbc,
|
|
|
|
struct page *page);
|
|
|
|
int FASTCALL(set_page_dirty(struct page *page));
|
|
|
|
int set_page_dirty_lock(struct page *page);
|
|
|
|
int clear_page_dirty_for_io(struct page *page);
|
|
|
|
|
2007-07-19 01:48:16 -07:00
|
|
|
extern unsigned long move_page_tables(struct vm_area_struct *vma,
|
|
|
|
unsigned long old_addr, struct vm_area_struct *new_vma,
|
|
|
|
unsigned long new_addr, unsigned long len);
|
2005-04-16 15:20:36 -07:00
|
|
|
extern unsigned long do_mremap(unsigned long addr,
|
|
|
|
unsigned long old_len, unsigned long new_len,
|
|
|
|
unsigned long flags, unsigned long new_addr);
|
2007-07-19 01:48:16 -07:00
|
|
|
extern int mprotect_fixup(struct vm_area_struct *vma,
|
|
|
|
struct vm_area_struct **pprev, unsigned long start,
|
|
|
|
unsigned long end, unsigned long newflags);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
2007-07-17 04:03:17 -07:00
|
|
|
* A callback you can register to apply pressure to ageable caches.
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
2007-07-17 04:03:17 -07:00
|
|
|
* 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should
|
|
|
|
* look through the least-recently-used 'nr_to_scan' entries and
|
|
|
|
* attempt to free them up. It should return the number of objects
|
|
|
|
* which remain in the cache. If it returns -1, it means it cannot do
|
|
|
|
* any scanning at this time (eg. there is a risk of deadlock).
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
2007-07-17 04:03:17 -07:00
|
|
|
* The 'gfpmask' refers to the allocation we are currently trying to
|
|
|
|
* fulfil.
|
|
|
|
*
|
|
|
|
* Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
|
|
|
|
* querying the cache size, so a fastpath for that case is appropriate.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2007-07-17 04:03:17 -07:00
|
|
|
struct shrinker {
|
|
|
|
int (*shrink)(int nr_to_scan, gfp_t gfp_mask);
|
|
|
|
int seeks; /* seeks to recreate an obj */
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-07-17 04:03:17 -07:00
|
|
|
/* These are for internal use */
|
|
|
|
struct list_head list;
|
|
|
|
long nr; /* objs pending delete */
|
|
|
|
};
|
|
|
|
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
|
|
|
|
extern void register_shrinker(struct shrinker *);
|
|
|
|
extern void unregister_shrinker(struct shrinker *);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2006-09-25 23:30:57 -07:00
|
|
|
/*
|
|
|
|
* Some shared mappings will want the pages marked read-only
|
|
|
|
* to track write events. If so, we'll downgrade vm_page_prot
|
|
|
|
* to the private version (using protection_map[] without the
|
|
|
|
* VM_SHARED bit).
|
|
|
|
*/
|
|
|
|
static inline int vma_wants_writenotify(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
unsigned int vm_flags = vma->vm_flags;
|
|
|
|
|
|
|
|
/* If it was private or non-writable, the write bit is already clear */
|
|
|
|
if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* The backer wishes to know when pages are first written to? */
|
|
|
|
if (vma->vm_ops && vma->vm_ops->page_mkwrite)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/* The open routine did something to the protections already? */
|
|
|
|
if (pgprot_val(vma->vm_page_prot) !=
|
|
|
|
pgprot_val(protection_map[vm_flags &
|
|
|
|
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Specialty mapping? */
|
|
|
|
if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Can the mapping track the dirty pages? */
|
|
|
|
return vma->vm_file && vma->vm_file->f_mapping &&
|
|
|
|
mapping_cap_account_dirty(vma->vm_file->f_mapping);
|
|
|
|
}
|
|
|
|
|
2005-11-29 14:03:14 -08:00
|
|
|
extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl));
|
|
|
|
|
2007-05-06 14:49:02 -07:00
|
|
|
#ifdef __PAGETABLE_PUD_FOLDED
|
|
|
|
static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd,
|
|
|
|
unsigned long address)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else
|
2005-10-29 18:16:22 -07:00
|
|
|
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
|
2007-05-06 14:49:02 -07:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef __PAGETABLE_PMD_FOLDED
|
|
|
|
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
|
|
|
|
unsigned long address)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else
|
2005-10-29 18:16:22 -07:00
|
|
|
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
|
2007-05-06 14:49:02 -07:00
|
|
|
#endif
|
|
|
|
|
2005-10-29 18:16:22 -07:00
|
|
|
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
|
|
|
|
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* The following ifdef needed to get the 4level-fixup.h header to work.
|
|
|
|
* Remove it when 4level-fixup.h has been removed.
|
|
|
|
*/
|
2005-10-29 18:16:22 -07:00
|
|
|
#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
|
2005-04-16 15:20:36 -07:00
|
|
|
static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
|
|
|
|
{
|
2005-10-29 18:16:22 -07:00
|
|
|
return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
|
|
|
|
NULL: pud_offset(pgd, address);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
|
|
|
|
{
|
2005-10-29 18:16:22 -07:00
|
|
|
return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
|
|
|
|
NULL: pmd_offset(pud, address);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2005-10-29 18:16:22 -07:00
|
|
|
#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
|
|
|
|
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
|
|
|
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
|
|
|
|
/*
|
|
|
|
* We tuck a spinlock to guard each pagetable page into its struct page,
|
|
|
|
* at page->private, with BUILD_BUG_ON to make sure that this will not
|
|
|
|
* overflow into the next struct page (as it might with DEBUG_SPINLOCK).
|
|
|
|
* When freeing, reset page->mapping so free_pages_check won't complain.
|
|
|
|
*/
|
2006-01-08 01:04:36 -08:00
|
|
|
/*
 * Address of the ->ptl member of @page.  The whole expansion is wrapped in
 * parentheses so that it can be used as an operand directly, for example
 * __pte_lockptr(page)->member, without '&' binding to a larger expression.
 */
#define __pte_lockptr(page) (&((page)->ptl))
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
|
|
|
/*
 * Split-ptlock variant: run spin_lock_init() on the lock that
 * __pte_lockptr() locates inside @_page.
 */
#define pte_lock_init(_page) \
	do { \
		spin_lock_init(__pte_lockptr(_page)); \
	} while (0)
|
|
|
|
/* Split-ptlock variant: reset page->mapping to NULL so free_pages_check won't complain. */
#define pte_lock_deinit(page) ((page)->mapping = NULL)
|
|
|
|
/*
 * Split-ptlock variant: the lock is the one __pte_lockptr() finds in
 * pmd_page(*pmd); @mm is evaluated only to be discarded.
 */
#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
|
|
|
|
#else
|
|
|
|
/*
|
|
|
|
* We use mm->page_table_lock to guard all pagetable pages of the mm.
|
|
|
|
*/
|
|
|
|
/* Single-lock variant: nothing to initialise per page, expands to an empty statement. */
#define pte_lock_init(page) do {} while (0)
|
|
|
|
/* Single-lock variant: nothing to undo per page, expands to an empty statement. */
#define pte_lock_deinit(page) do {} while (0)
|
|
|
|
/*
 * Single-lock variant: always yields &mm->page_table_lock; @pmd is evaluated
 * only to be discarded.
 */
#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
|
|
|
|
#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
|
|
|
|
|
2005-10-29 18:16:23 -07:00
|
|
|
/*
 * Look up the pte for @address under @pmd with pte_offset_map() and take the
 * lock returned by pte_lockptr(mm, pmd).  The lock pointer is stored through
 * @ptlp before spin_lock() is called; the value of the whole expression is
 * the pte pointer.
 */
#define pte_offset_map_lock(mm, pmd, address, ptlp) \
({ \
	spinlock_t *__ptl; \
	pte_t *__pte; \
	\
	__ptl = pte_lockptr(mm, pmd); \
	__pte = pte_offset_map(pmd, address); \
	*(ptlp) = __ptl; \
	spin_lock(__ptl); \
	__pte; \
})
|
|
|
|
|
|
|
|
/*
 * Release @ptl with spin_unlock() and then pass @pte to pte_unmap(), in that
 * order.
 */
#define pte_unmap_unlock(pte, ptl) \
	do { \
		spin_unlock(ptl); \
		pte_unmap(pte); \
	} while (0)
|
|
|
|
|
2005-10-29 18:16:22 -07:00
|
|
|
/*
 * Evaluates to pte_offset_map(pmd, address), or to NULL when *pmd is not
 * present and __pte_alloc() returns non-zero.  __pte_alloc() is only called
 * when *pmd is not present.
 */
#define pte_alloc_map(mm, pmd, address) \
	((unlikely(!pmd_present(*(pmd))) && \
	  __pte_alloc(mm, pmd, address)) ? \
		NULL : pte_offset_map(pmd, address))
|
|
|
|
|
2005-10-29 18:16:23 -07:00
|
|
|
/*
 * Evaluates to pte_offset_map_lock(mm, pmd, address, ptlp), or to NULL when
 * *pmd is not present and __pte_alloc() returns non-zero.  __pte_alloc() is
 * only called when *pmd is not present.
 */
#define pte_alloc_map_lock(mm, pmd, address, ptlp) \
	((unlikely(!pmd_present(*(pmd))) && \
	  __pte_alloc(mm, pmd, address)) ? \
		NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
|
|
|
|
|
2005-10-29 18:16:22 -07:00
|
|
|
/*
 * Evaluates to pte_offset_kernel(pmd, address), or to NULL when *pmd is not
 * present and __pte_alloc_kernel() returns non-zero.  __pte_alloc_kernel()
 * is only called when *pmd is not present.
 */
#define pte_alloc_kernel(pmd, address) \
	((unlikely(!pmd_present(*(pmd))) && \
	  __pte_alloc_kernel(pmd, address)) ? \
		NULL : pte_offset_kernel(pmd, address))
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
extern void free_area_init(unsigned long * zones_size);
|
|
|
|
extern void free_area_init_node(int nid, pg_data_t *pgdat,
|
|
|
|
unsigned long * zones_size, unsigned long zone_start_pfn,
|
|
|
|
unsigned long *zholes_size);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 01:49:43 -07:00
|
|
|
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
|
|
|
|
/*
|
|
|
|
* With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
|
|
|
|
* zones, allocate the backing mem_map and account for memory holes in a more
|
|
|
|
* architecture independent manner. This is a substitute for creating the
|
|
|
|
* zone_sizes[] and zholes_size[] arrays and passing them to
|
|
|
|
* free_area_init_node()
|
|
|
|
*
|
|
|
|
* An architecture is expected to register each range of page frames backed by
|
|
|
|
* physical memory with add_active_range() before calling
|
|
|
|
* free_area_init_nodes() passing in the PFN each zone ends at. At a basic
|
|
|
|
* usage, an architecture is expected to do something like
|
|
|
|
*
|
|
|
|
* unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
|
|
|
|
* max_highmem_pfn};
|
|
|
|
* for_each_valid_physical_page_range()
|
|
|
|
* add_active_range(node_id, start_pfn, end_pfn)
|
|
|
|
* free_area_init_nodes(max_zone_pfns);
|
|
|
|
*
|
|
|
|
* If the architecture guarantees that there are no holes in the ranges
|
|
|
|
* registered with add_active_range(), free_bootmem_with_active_regions()
|
|
|
|
* will call free_bootmem_node() for each registered physical page range.
|
|
|
|
* Similarly sparse_memory_present_with_active_regions() calls
|
|
|
|
* memory_present() for each range when SPARSEMEM is enabled.
|
|
|
|
*
|
|
|
|
* See mm/page_alloc.c for more information on each function exposed by
|
|
|
|
* CONFIG_ARCH_POPULATES_NODE_MAP
|
|
|
|
*/
|
|
|
|
extern void free_area_init_nodes(unsigned long *max_zone_pfn);
|
|
|
|
extern void add_active_range(unsigned int nid, unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn);
|
|
|
|
extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
|
|
|
|
unsigned long new_end_pfn);
|
2006-09-27 01:49:59 -07:00
|
|
|
extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 01:49:43 -07:00
|
|
|
extern void remove_all_active_ranges(void);
|
|
|
|
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn);
|
|
|
|
extern void get_pfn_range_for_nid(unsigned int nid,
|
|
|
|
unsigned long *start_pfn, unsigned long *end_pfn);
|
|
|
|
extern unsigned long find_min_pfn_with_active_regions(void);
|
|
|
|
extern unsigned long find_max_pfn_with_active_regions(void);
|
|
|
|
extern void free_bootmem_with_active_regions(int nid,
|
|
|
|
unsigned long max_low_pfn);
|
|
|
|
extern void sparse_memory_present_with_active_regions(int nid);
|
|
|
|
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
|
|
|
|
extern int early_pfn_to_nid(unsigned long pfn);
|
|
|
|
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
|
|
|
|
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
|
2006-09-27 01:49:56 -07:00
|
|
|
extern void set_dma_reserve(unsigned long new_dma_reserve);
|
2007-01-10 23:15:30 -08:00
|
|
|
extern void memmap_init_zone(unsigned long, int, unsigned long,
|
|
|
|
unsigned long, enum memmap_context);
|
2005-10-29 18:16:54 -07:00
|
|
|
extern void setup_per_zone_pages_min(void);
|
2005-04-16 15:20:36 -07:00
|
|
|
extern void mem_init(void);
|
|
|
|
extern void show_mem(void);
|
|
|
|
extern void si_meminfo(struct sysinfo * val);
|
|
|
|
extern void si_meminfo_node(struct sysinfo *val, int nid);
|
|
|
|
|
2005-06-21 17:14:47 -07:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
extern void setup_per_cpu_pageset(void);
|
|
|
|
#else
|
|
|
|
/* !CONFIG_NUMA: empty inline stub so callers need no #ifdef. */
static inline void setup_per_cpu_pageset(void)
{
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/* prio_tree.c */
|
|
|
|
void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
|
|
|
|
void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
|
|
|
|
void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *);
|
|
|
|
struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
|
|
|
|
struct prio_tree_iter *iter);
|
|
|
|
|
|
|
|
/*
 * for-loop header: prio_tree_iter_init() sets up @iter once and @vma starts
 * as NULL; each pass assigns @vma from vma_prio_tree_next() and the loop ends
 * when that returns NULL.
 */
#define vma_prio_tree_foreach(vma, iter, root, begin, end) \
	for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \
	     (vma = vma_prio_tree_next(vma, iter)); )
|
|
|
|
|
|
|
|
static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
|
|
|
|
struct list_head *list)
|
|
|
|
{
|
|
|
|
vma->shared.vm_set.parent = NULL;
|
|
|
|
list_add_tail(&vma->shared.vm_set.list, list);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* mmap.c */
|
|
|
|
extern int __vm_enough_memory(long pages, int cap_sys_admin);
|
|
|
|
extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
|
|
|
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
|
|
|
|
extern struct vm_area_struct *vma_merge(struct mm_struct *,
|
|
|
|
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
|
|
|
|
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
|
|
|
|
struct mempolicy *);
|
|
|
|
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
|
|
|
|
extern int split_vma(struct mm_struct *,
|
|
|
|
struct vm_area_struct *, unsigned long addr, int new_below);
|
|
|
|
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
|
|
|
|
extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
|
|
|
|
struct rb_node **, struct rb_node *);
|
2005-10-29 18:15:57 -07:00
|
|
|
extern void unlink_file_vma(struct vm_area_struct *);
|
2005-04-16 15:20:36 -07:00
|
|
|
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
|
|
|
|
unsigned long addr, unsigned long len, pgoff_t pgoff);
|
|
|
|
extern void exit_mmap(struct mm_struct *);
|
2005-05-01 08:58:35 -07:00
|
|
|
extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
|
2007-02-08 14:20:41 -08:00
|
|
|
extern int install_special_mapping(struct mm_struct *mm,
|
|
|
|
unsigned long addr, unsigned long len,
|
|
|
|
unsigned long flags, struct page **pages);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
|
|
|
|
|
|
|
|
extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long pgoff);
|
2007-07-15 23:38:26 -07:00
|
|
|
extern unsigned long mmap_region(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long flags,
|
|
|
|
unsigned int vm_flags, unsigned long pgoff,
|
|
|
|
int accountable);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long offset)
|
|
|
|
{
|
|
|
|
unsigned long ret = -EINVAL;
|
|
|
|
if ((offset + PAGE_ALIGN(len)) < offset)
|
|
|
|
goto out;
|
|
|
|
if (!(offset & ~PAGE_MASK))
|
|
|
|
ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
extern int do_munmap(struct mm_struct *, unsigned long, size_t);
|
|
|
|
|
|
|
|
extern unsigned long do_brk(unsigned long, unsigned long);
|
|
|
|
|
|
|
|
/* filemap.c */
|
|
|
|
extern unsigned long page_unuse(struct page *);
|
|
|
|
extern void truncate_inode_pages(struct address_space *, loff_t);
|
2006-01-06 00:10:36 -08:00
|
|
|
extern void truncate_inode_pages_range(struct address_space *,
|
|
|
|
loff_t lstart, loff_t lend);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/* generic vm_area_ops exported for stackable file systems */
|
2007-07-19 01:47:03 -07:00
|
|
|
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/* mm/page-writeback.c */
|
|
|
|
int write_one_page(struct page *page, int wait);
|
|
|
|
|
|
|
|
/* readahead.c */
|
|
|
|
#define VM_MAX_READAHEAD 128 /* kbytes */
|
|
|
|
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
|
|
|
|
#define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before
|
|
|
|
* turning readahead off */
|
|
|
|
|
|
|
|
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
|
2005-11-07 00:59:28 -08:00
|
|
|
pgoff_t offset, unsigned long nr_to_read);
|
2005-04-16 15:20:36 -07:00
|
|
|
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
|
2005-11-07 00:59:28 -08:00
|
|
|
pgoff_t offset, unsigned long nr_to_read);
|
2007-07-19 01:48:08 -07:00
|
|
|
|
|
|
|
void page_cache_sync_readahead(struct address_space *mapping,
|
|
|
|
struct file_ra_state *ra,
|
|
|
|
struct file *filp,
|
|
|
|
pgoff_t offset,
|
|
|
|
unsigned long size);
|
|
|
|
|
|
|
|
void page_cache_async_readahead(struct address_space *mapping,
|
|
|
|
struct file_ra_state *ra,
|
|
|
|
struct file *filp,
|
|
|
|
struct page *pg,
|
|
|
|
pgoff_t offset,
|
|
|
|
unsigned long size);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
unsigned long max_sane_readahead(unsigned long nr);
|
|
|
|
|
|
|
|
/* Do stack extension */
|
2005-10-29 18:16:20 -07:00
|
|
|
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
|
2005-11-18 16:16:42 -05:00
|
|
|
#ifdef CONFIG_IA64
|
2005-10-29 18:16:20 -07:00
|
|
|
extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
|
2005-11-18 16:16:42 -05:00
|
|
|
#endif
|
2007-07-19 01:48:16 -07:00
|
|
|
extern int expand_stack_downwards(struct vm_area_struct *vma,
|
|
|
|
unsigned long address);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
|
|
|
|
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
|
|
|
|
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
|
|
|
|
struct vm_area_struct **pprev);
|
|
|
|
|
|
|
|
/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
|
|
|
|
NULL if none. Assume start_addr < end_addr. */
|
|
|
|
static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
|
|
|
|
{
|
|
|
|
struct vm_area_struct * vma = find_vma(mm,start_addr);
|
|
|
|
|
|
|
|
if (vma && end_addr <= vma->vm_start)
|
|
|
|
vma = NULL;
|
|
|
|
return vma;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long vma_pages(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
|
2006-07-26 21:39:49 +01:00
|
|
|
pgprot_t vm_get_page_prot(unsigned long vm_flags);
|
2005-10-29 18:16:33 -07:00
|
|
|
struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
|
|
|
|
struct page *vmalloc_to_page(void *addr);
|
|
|
|
unsigned long vmalloc_to_pfn(void *addr);
|
|
|
|
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
|
|
|
|
unsigned long pfn, unsigned long size, pgprot_t);
|
2005-11-30 09:35:19 -08:00
|
|
|
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
|
2007-02-12 00:51:36 -08:00
|
|
|
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
unsigned long pfn);
|
2005-10-29 18:16:33 -07:00
|
|
|
|
2005-11-28 14:34:23 -08:00
|
|
|
struct page *follow_page(struct vm_area_struct *, unsigned long address,
|
2005-10-29 18:16:33 -07:00
|
|
|
unsigned int foll_flags);
|
|
|
|
#define FOLL_WRITE 0x01 /* check pte is writable */
|
|
|
|
#define FOLL_TOUCH 0x02 /* mark page accessed */
|
|
|
|
#define FOLL_GET 0x04 /* do get_page on page */
|
|
|
|
#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-05-06 14:48:54 -07:00
|
|
|
typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr,
|
|
|
|
void *data);
|
|
|
|
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
|
|
|
|
unsigned long size, pte_fn_t fn, void *data);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#ifdef CONFIG_PROC_FS
|
2005-10-29 18:15:56 -07:00
|
|
|
void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
|
2005-04-16 15:20:36 -07:00
|
|
|
#else
|
2005-10-29 18:15:56 -07:00
|
|
|
/* !CONFIG_PROC_FS: empty inline stub; all arguments are ignored. */
static inline void vm_stat_account(struct mm_struct *mm,
				   unsigned long flags, struct file *file,
				   long pages)
{
}
|
|
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
|
|
|
|
#ifndef CONFIG_DEBUG_PAGEALLOC
|
|
|
|
/* !CONFIG_DEBUG_PAGEALLOC: empty inline stub; all arguments are ignored. */
static inline void kernel_map_pages(struct page *page, int numpages,
				    int enable)
{
}
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
|
|
|
|
|
|
|
extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk);
|
|
|
|
#ifdef __HAVE_ARCH_GATE_AREA
|
|
|
|
int in_gate_area_no_task(unsigned long addr);
|
|
|
|
int in_gate_area(struct task_struct *task, unsigned long addr);
|
|
|
|
#else
|
|
|
|
int in_gate_area_no_task(unsigned long addr);
|
|
|
|
/*
 * !__HAVE_ARCH_GATE_AREA: @task is evaluated and discarded, and the result is
 * in_gate_area_no_task(@addr).  @task is parenthesized before the void cast
 * so that a compound argument (e.g. a conditional expression) is cast as a
 * whole instead of having the cast bind to its first operand.
 */
#define in_gate_area(task, addr) ({(void)(task); in_gate_area_no_task(addr);})
|
|
|
|
#endif /* __HAVE_ARCH_GATE_AREA */
|
|
|
|
|
2006-01-08 01:00:39 -08:00
|
|
|
int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
|
|
|
|
void __user *, size_t *, loff_t *);
|
2006-03-22 00:08:19 -08:00
|
|
|
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
|
2006-01-08 01:00:39 -08:00
|
|
|
unsigned long lru_pages);
|
|
|
|
void drop_pagecache(void);
|
|
|
|
void drop_slab(void);
|
|
|
|
|
2006-02-20 18:28:07 -08:00
|
|
|
#ifndef CONFIG_MMU
|
|
|
|
#define randomize_va_space 0
|
|
|
|
#else
|
2006-02-16 23:41:58 +01:00
|
|
|
extern int randomize_va_space;
|
2006-02-20 18:28:07 -08:00
|
|
|
#endif
|
2006-02-16 23:41:58 +01:00
|
|
|
|
2006-09-27 01:50:23 -07:00
|
|
|
__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma);
|
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:50 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
#endif /* _LINUX_MM_H */
|