2008-01-30 13:33:41 +01:00
|
|
|
/*
|
|
|
|
* Copyright 2002 Andi Kleen, SuSE Labs.
|
2005-04-16 15:20:36 -07:00
|
|
|
* Thanks to Ben LaHaise for precious feedback.
|
2008-01-30 13:33:41 +01:00
|
|
|
*/
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/module.h>
|
2008-01-30 13:33:41 +01:00
|
|
|
#include <linux/sched.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/slab.h>
|
2008-01-30 13:33:41 +01:00
|
|
|
#include <linux/mm.h>
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <asm/processor.h>
|
|
|
|
#include <asm/tlbflush.h>
|
2006-01-06 00:12:10 -08:00
|
|
|
#include <asm/sections.h>
|
2008-01-30 13:33:41 +01:00
|
|
|
#include <asm/uaccess.h>
|
|
|
|
#include <asm/pgalloc.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-01-30 13:33:43 +01:00
|
|
|
pte_t *lookup_address(unsigned long address, int *level)
|
2008-01-30 13:33:41 +01:00
|
|
|
{
|
2005-04-16 15:20:36 -07:00
|
|
|
pgd_t *pgd = pgd_offset_k(address);
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
2008-01-30 13:33:41 +01:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
if (pgd_none(*pgd))
|
|
|
|
return NULL;
|
|
|
|
pud = pud_offset(pgd, address);
|
|
|
|
if (pud_none(*pud))
|
|
|
|
return NULL;
|
|
|
|
pmd = pmd_offset(pud, address);
|
|
|
|
if (pmd_none(*pmd))
|
|
|
|
return NULL;
|
2008-01-30 13:33:43 +01:00
|
|
|
*level = 2;
|
2005-04-16 15:20:36 -07:00
|
|
|
if (pmd_large(*pmd))
|
|
|
|
return (pte_t *)pmd;
|
2008-01-30 13:33:43 +01:00
|
|
|
*level = 3;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-01-30 13:33:41 +01:00
|
|
|
return pte_offset_kernel(pmd, address);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Install @pte into the pmd-level slot @kpte of init_mm and, if needed,
 * into the matching slot of every other page table.
 *
 * When the kernel PMD is shared between all page tables
 * (SHARED_KERNEL_PMD), updating init_mm is sufficient.  When it is not
 * shared (the history of this code names Xen as such a case), kernel
 * mapping updates have to be propagated into every pgd on pgd_list.
 * That list is threaded through the ->index field of the pgd's struct
 * page and is walked under pgd_lock.
 *
 * @address is used to index the page tables (pgd_index/pud_offset/
 * pmd_offset), so it must be the virtual address being remapped.
 */
static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
	struct page *pgd_page;
	unsigned long irq_flags;

	/* change init_mm */
	set_pte_atomic(kpte, pte);

	if (!SHARED_KERNEL_PMD) {
		spin_lock_irqsave(&pgd_lock, irq_flags);

		pgd_page = pgd_list;
		while (pgd_page) {
			pgd_t *pgd = (pgd_t *)page_address(pgd_page) +
					pgd_index(address);
			pud_t *pud = pud_offset(pgd, address);
			pmd_t *pmd = pmd_offset(pud, address);

			set_pte_atomic((pte_t *)pmd, pte);

			/* next pgd page is chained through ->index */
			pgd_page = (struct page *)pgd_page->index;
		}

		spin_unlock_irqrestore(&pgd_lock, irq_flags);
	}
}
|
|
|
|
|
2008-01-30 13:33:56 +01:00
|
|
|
static int
|
|
|
|
split_large_page(pte_t *kpte, unsigned long address, pgprot_t ref_prot)
|
|
|
|
{
|
|
|
|
int i, level;
|
|
|
|
unsigned long addr;
|
|
|
|
pte_t *pbase, *tmp;
|
|
|
|
struct page *base;
|
|
|
|
|
|
|
|
base = alloc_pages(GFP_KERNEL, 0);
|
|
|
|
if (!base)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
down_write(&init_mm.mmap_sem);
|
|
|
|
/*
|
|
|
|
* Check for races, another CPU might have split this page
|
|
|
|
* up for us already:
|
|
|
|
*/
|
|
|
|
tmp = lookup_address(address, &level);
|
|
|
|
if (tmp != kpte)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
address = __pa(address);
|
|
|
|
addr = address & LARGE_PAGE_MASK;
|
|
|
|
pbase = (pte_t *)page_address(base);
|
|
|
|
paravirt_alloc_pt(&init_mm, page_to_pfn(base));
|
|
|
|
|
|
|
|
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE)
|
|
|
|
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Install the new, split up pagetable:
|
|
|
|
*/
|
|
|
|
set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
|
|
|
|
base = NULL;
|
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
up_write(&init_mm.mmap_sem);
|
|
|
|
|
|
|
|
if (base)
|
|
|
|
__free_pages(base, 0);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-01-30 13:33:41 +01:00
|
|
|
/*
 * Change the protection of the single lowmem page @page in the kernel
 * linear mapping to @prot.
 *
 * If the page is currently covered by a large mapping, that mapping is
 * split first and the lookup is repeated.
 *
 * Returns 0 on success, -EINVAL if the address has no mapping according
 * to lookup_address(), or the error returned by split_large_page().
 */
static int __change_page_attr(struct page *page, pgprot_t prot)
{
	pgprot_t ref_prot = PAGE_KERNEL;
	struct page *kpte_page;
	unsigned long address;
	int level, err = 0;
	pte_t *kpte;

	BUG_ON(PageHighMem(page));
	address = (unsigned long)page_address(page);

repeat:
	kpte = lookup_address(address, &level);
	if (!kpte)
		return -EINVAL;

	kpte_page = virt_to_page(kpte);
	BUG_ON(PageLRU(kpte_page));
	BUG_ON(PageCompound(kpte_page));

	/*
	 * Better fail early if someone sets the kernel text to NX.
	 * Does not cover __inittext
	 */
	BUG_ON(address >= (unsigned long)&_text &&
		address < (unsigned long)&_etext &&
	       (pgprot_val(prot) & _PAGE_NX));

	/*
	 * Protection for the other entries created by a split: large
	 * pages that start below _etext stay executable.
	 */
	if ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
		ref_prot = PAGE_KERNEL_EXEC;

	ref_prot = canon_pgprot(ref_prot);
	prot = canon_pgprot(prot);

	if (level == 3) {
		set_pte_atomic(kpte, mk_pte(page, prot));
	} else {
		/* Large mapping: split it, then retry on the new pte. */
		err = split_large_page(kpte, address, ref_prot);
		if (!err)
			goto repeat;
	}
	return err;
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Change the page attributes of an page in the linear mapping.
|
|
|
|
*
|
|
|
|
* This should be used when a page is mapped with a different caching policy
|
|
|
|
* than write-back somewhere - some CPUs do not like it when mappings with
|
|
|
|
* different caching policies exist. This changes the page attributes of the
|
|
|
|
* in kernel linear mapping too.
|
2008-01-30 13:33:41 +01:00
|
|
|
*
|
2005-04-16 15:20:36 -07:00
|
|
|
* The caller needs to ensure that there are no conflicting mappings elsewhere.
|
|
|
|
* This function only deals with the kernel linear map.
|
2008-01-30 13:33:41 +01:00
|
|
|
*
|
2005-04-16 15:20:36 -07:00
|
|
|
* Caller must call global_flush_tlb() after this.
|
|
|
|
*/
|
|
|
|
int change_page_attr(struct page *page, int numpages, pgprot_t prot)
|
|
|
|
{
|
2008-01-30 13:33:41 +01:00
|
|
|
int err = 0, i;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-01-30 13:33:41 +01:00
|
|
|
for (i = 0; i < numpages; i++, page++) {
|
2005-04-16 15:20:36 -07:00
|
|
|
err = __change_page_attr(page, prot);
|
2008-01-30 13:33:41 +01:00
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return err;
|
|
|
|
}
|
2008-01-30 13:33:41 +01:00
|
|
|
EXPORT_SYMBOL(change_page_attr);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-01-30 13:33:55 +01:00
|
|
|
/*
 * Address-based front end to change_page_attr().
 *
 * Counts how many of the @numpages pages starting at @addr pass
 * pfn_valid(), stopping at the first one that does not, and then calls
 * change_page_attr() for that many pages starting at virt_to_page(addr).
 *
 * NOTE(review): the pfn is computed as addr >> PAGE_SHIFT, while the same
 * @addr is passed to lookup_address() and virt_to_page() as a virtual
 * address - a __pa() conversion looks to be missing; confirm.
 * NOTE(review): the BUG_ON below triggers when the pte exists and is
 * populated; confirm that this is the intended condition before relying
 * on this function.
 */
int change_page_attr_addr(unsigned long addr, int numpages, pgprot_t prot)
{
	unsigned long first_pfn = (addr >> PAGE_SHIFT);
	int count = 0;

	while (count < numpages && pfn_valid(first_pfn + count)) {
		int level;
		pte_t *pte = lookup_address(addr + count*PAGE_SIZE, &level);

		BUG_ON(pte && !pte_none(*pte));
		count++;
	}

	return change_page_attr(virt_to_page(addr), count, prot);
}
|
|
|
|
|
|
|
|
static void flush_kernel_map(void *arg)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Flush all to work around Errata in early athlons regarding
|
|
|
|
* large page flushing.
|
|
|
|
*/
|
|
|
|
__flush_tlb_all();
|
|
|
|
|
|
|
|
if (boot_cpu_data.x86_model >= 4)
|
|
|
|
wbinvd();
|
|
|
|
}
|
|
|
|
|
|
|
|
void global_flush_tlb(void)
|
|
|
|
{
|
2005-04-16 15:20:36 -07:00
|
|
|
BUG_ON(irqs_disabled());
|
|
|
|
|
2008-01-30 13:33:55 +01:00
|
|
|
on_each_cpu(flush_kernel_map, NULL, 1, 1);
|
2006-06-23 02:05:55 -07:00
|
|
|
}
|
2008-01-30 13:33:41 +01:00
|
|
|
EXPORT_SYMBOL(global_flush_tlb);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Map (@enable != 0) or unmap (@enable == 0) @numpages pages starting at
 * @page in the kernel linear mapping.  Highmem pages are ignored.
 */
void kernel_map_pages(struct page *page, int numpages, int enable)
{
	pgprot_t prot;

	if (PageHighMem(page))
		return;

	if (enable) {
		prot = PAGE_KERNEL;
	} else {
		/* Check the range for held locks before it is unmapped. */
		debug_check_no_locks_freed(page_address(page),
					   numpages * PAGE_SIZE);
		prot = __pgprot(0);
	}

	/*
	 * the return value is ignored - the calls cannot fail,
	 * large pages are disabled at boot time.
	 */
	change_page_attr(page, numpages, prot);

	/*
	 * we should perform an IPI and flush all tlbs,
	 * but that can deadlock->flush only current cpu.
	 */
	__flush_tlb_all();
}
#endif
|