mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-15 13:15:57 +00:00
Merge branch 'kvm-e500-check-writable-pfn' into HEAD
The new __kvm_faultin_pfn() function is upset by the fact that e500 KVM ignores host page permissions: __kvm_faultin_pfn() requires a "writable" outgoing argument, but e500 KVM is passing NULL. While a simple fix that merely allows writable to be NULL would be possible, it is quite ugly to have e500 KVM completely ignore the host permissions and map read-only host pages as guest-writable. Merge a more complete fix and remove the VMA-based attempts at building huge shadow TLB entries. Using a PTE lookup, similar to what is done for x86, is better and works with remap_pfn_range() because it does not assume that VM_PFNMAP areas are contiguous. Note that the same incorrect logic is present in ARM's get_vma_page_shift() and RISC-V's kvm_riscv_gstage_ioremap(). Fortunately, for e500 most of the code is already there; it just has to be changed to compute the range from find_linux_pte()'s output rather than find_vma(). The new code works for both VM_PFNMAP and hugetlb mappings, so the separate hugetlb-specific code path is removed. Patches 2-5 were tested by the reporter, Christian Zigotzky. Since the difference with v1 is minimal, I am going to send it to Linus today.
This commit is contained in:
commit
71b7bf1702
@ -34,6 +34,8 @@ enum vcpu_ftr {
|
||||
#define E500_TLB_BITMAP (1 << 30)
|
||||
/* TLB1 entry is mapped by host TLB0 */
|
||||
#define E500_TLB_TLB0 (1 << 29)
|
||||
/* entry is writable on the host */
|
||||
#define E500_TLB_WRITABLE (1 << 28)
|
||||
/* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */
|
||||
#define E500_TLB_MAS2_ATTR (0x7f)
|
||||
|
||||
|
@ -45,11 +45,14 @@ static inline unsigned int tlb1_max_shadow_size(void)
|
||||
return host_tlb_params[1].entries - tlbcam_index - 1;
|
||||
}
|
||||
|
||||
static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
|
||||
static inline u32 e500_shadow_mas3_attrib(u32 mas3, bool writable, int usermode)
|
||||
{
|
||||
/* Mask off reserved bits. */
|
||||
mas3 &= MAS3_ATTRIB_MASK;
|
||||
|
||||
if (!writable)
|
||||
mas3 &= ~(MAS3_UW|MAS3_SW);
|
||||
|
||||
#ifndef CONFIG_KVM_BOOKE_HV
|
||||
if (!usermode) {
|
||||
/* Guest is in supervisor mode,
|
||||
@ -242,17 +245,18 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
|
||||
return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
|
||||
}
|
||||
|
||||
static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref,
|
||||
static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
|
||||
struct kvm_book3e_206_tlb_entry *gtlbe,
|
||||
kvm_pfn_t pfn, unsigned int wimg)
|
||||
kvm_pfn_t pfn, unsigned int wimg,
|
||||
bool writable)
|
||||
{
|
||||
ref->pfn = pfn;
|
||||
ref->flags = E500_TLB_VALID;
|
||||
if (writable)
|
||||
ref->flags |= E500_TLB_WRITABLE;
|
||||
|
||||
/* Use guest supplied MAS2_G and MAS2_E */
|
||||
ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg;
|
||||
|
||||
return tlbe_is_writable(gtlbe);
|
||||
}
|
||||
|
||||
static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
|
||||
@ -305,6 +309,7 @@ static void kvmppc_e500_setup_stlbe(
|
||||
{
|
||||
kvm_pfn_t pfn = ref->pfn;
|
||||
u32 pr = vcpu->arch.shared->msr & MSR_PR;
|
||||
bool writable = !!(ref->flags & E500_TLB_WRITABLE);
|
||||
|
||||
BUG_ON(!(ref->flags & E500_TLB_VALID));
|
||||
|
||||
@ -312,7 +317,7 @@ static void kvmppc_e500_setup_stlbe(
|
||||
stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
|
||||
stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR);
|
||||
stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
|
||||
e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
|
||||
e500_shadow_mas3_attrib(gtlbe->mas7_3, writable, pr);
|
||||
}
|
||||
|
||||
static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
|
||||
@ -321,15 +326,14 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
|
||||
struct tlbe_ref *ref)
|
||||
{
|
||||
struct kvm_memory_slot *slot;
|
||||
unsigned long pfn = 0; /* silence GCC warning */
|
||||
unsigned int psize;
|
||||
unsigned long pfn;
|
||||
struct page *page = NULL;
|
||||
unsigned long hva;
|
||||
int pfnmap = 0;
|
||||
int tsize = BOOK3E_PAGESZ_4K;
|
||||
int ret = 0;
|
||||
unsigned long mmu_seq;
|
||||
struct kvm *kvm = vcpu_e500->vcpu.kvm;
|
||||
unsigned long tsize_pages = 0;
|
||||
pte_t *ptep;
|
||||
unsigned int wimg = 0;
|
||||
pgd_t *pgdir;
|
||||
@ -351,30 +355,54 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
|
||||
slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
|
||||
hva = gfn_to_hva_memslot(slot, gfn);
|
||||
|
||||
if (tlbsel == 1) {
|
||||
struct vm_area_struct *vma;
|
||||
mmap_read_lock(kvm->mm);
|
||||
pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page);
|
||||
if (is_error_noslot_pfn(pfn)) {
|
||||
if (printk_ratelimit())
|
||||
pr_err("%s: real page not found for gfn %lx\n",
|
||||
__func__, (long)gfn);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
vma = find_vma(kvm->mm, hva);
|
||||
if (vma && hva >= vma->vm_start &&
|
||||
(vma->vm_flags & VM_PFNMAP)) {
|
||||
spin_lock(&kvm->mmu_lock);
|
||||
if (mmu_invalidate_retry(kvm, mmu_seq)) {
|
||||
ret = -EAGAIN;
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
||||
pgdir = vcpu_e500->vcpu.arch.pgdir;
|
||||
/*
|
||||
* This VMA is a physically contiguous region (e.g.
|
||||
* /dev/mem) that bypasses normal Linux page
|
||||
* management. Find the overlap between the
|
||||
* vma and the memslot.
|
||||
* We are just looking at the wimg bits, so we don't
|
||||
* care much about the trans splitting bit.
|
||||
* We are holding kvm->mmu_lock so a notifier invalidate
|
||||
* can't run hence pfn won't change.
|
||||
*/
|
||||
local_irq_save(flags);
|
||||
ptep = find_linux_pte(pgdir, hva, NULL, &psize);
|
||||
if (ptep) {
|
||||
pte_t pte = READ_ONCE(*ptep);
|
||||
|
||||
if (pte_present(pte)) {
|
||||
wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
|
||||
MAS2_WIMGE_MASK;
|
||||
} else {
|
||||
local_irq_restore(flags);
|
||||
pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
|
||||
__func__, (long)gfn, pfn);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
local_irq_restore(flags);
|
||||
|
||||
if (psize && tlbsel == 1) {
|
||||
unsigned long psize_pages, tsize_pages;
|
||||
unsigned long start, end;
|
||||
unsigned long slot_start, slot_end;
|
||||
|
||||
pfnmap = 1;
|
||||
|
||||
start = vma->vm_pgoff;
|
||||
end = start +
|
||||
vma_pages(vma);
|
||||
|
||||
pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
|
||||
psize_pages = 1UL << (psize - PAGE_SHIFT);
|
||||
start = pfn & ~(psize_pages - 1);
|
||||
end = start + psize_pages;
|
||||
|
||||
slot_start = pfn - (gfn - slot->base_gfn);
|
||||
slot_end = slot_start + slot->npages;
|
||||
@ -387,6 +415,12 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
|
||||
tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
|
||||
MAS1_TSIZE_SHIFT;
|
||||
|
||||
/*
|
||||
* Any page size that doesn't satisfy the host mapping
|
||||
* will fail the start and end tests.
|
||||
*/
|
||||
tsize = min(psize - PAGE_SHIFT + BOOK3E_PAGESZ_4K, tsize);
|
||||
|
||||
/*
|
||||
* e500 doesn't implement the lowest tsize bit,
|
||||
* or 1K pages.
|
||||
@ -419,79 +453,12 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
|
||||
pfn &= ~(tsize_pages - 1);
|
||||
break;
|
||||
}
|
||||
} else if (vma && hva >= vma->vm_start &&
|
||||
is_vm_hugetlb_page(vma)) {
|
||||
unsigned long psize = vma_kernel_pagesize(vma);
|
||||
|
||||
tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
|
||||
MAS1_TSIZE_SHIFT;
|
||||
|
||||
/*
|
||||
* Take the largest page size that satisfies both host
|
||||
* and guest mapping
|
||||
*/
|
||||
tsize = min(__ilog2(psize) - 10, tsize);
|
||||
|
||||
/*
|
||||
* e500 doesn't implement the lowest tsize bit,
|
||||
* or 1K pages.
|
||||
*/
|
||||
tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
|
||||
}
|
||||
|
||||
mmap_read_unlock(kvm->mm);
|
||||
}
|
||||
|
||||
if (likely(!pfnmap)) {
|
||||
tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
|
||||
pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, NULL, &page);
|
||||
if (is_error_noslot_pfn(pfn)) {
|
||||
if (printk_ratelimit())
|
||||
pr_err("%s: real page not found for gfn %lx\n",
|
||||
__func__, (long)gfn);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Align guest and physical address to page map boundaries */
|
||||
pfn &= ~(tsize_pages - 1);
|
||||
gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
|
||||
}
|
||||
|
||||
spin_lock(&kvm->mmu_lock);
|
||||
if (mmu_invalidate_retry(kvm, mmu_seq)) {
|
||||
ret = -EAGAIN;
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
||||
pgdir = vcpu_e500->vcpu.arch.pgdir;
|
||||
/*
|
||||
* We are just looking at the wimg bits, so we don't
|
||||
* care much about the trans splitting bit.
|
||||
* We are holding kvm->mmu_lock so a notifier invalidate
|
||||
* can't run hence pfn won't change.
|
||||
*/
|
||||
local_irq_save(flags);
|
||||
ptep = find_linux_pte(pgdir, hva, NULL, NULL);
|
||||
if (ptep) {
|
||||
pte_t pte = READ_ONCE(*ptep);
|
||||
|
||||
if (pte_present(pte)) {
|
||||
wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
|
||||
MAS2_WIMGE_MASK;
|
||||
local_irq_restore(flags);
|
||||
} else {
|
||||
local_irq_restore(flags);
|
||||
pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
|
||||
__func__, (long)gfn, pfn);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
|
||||
|
||||
kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, writable);
|
||||
kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
|
||||
ref, gvaddr, stlbe);
|
||||
writable = tlbe_is_writable(stlbe);
|
||||
|
||||
/* Clear i-cache for new pages */
|
||||
kvmppc_mmu_flush_icache(pfn);
|
||||
|
Loading…
x
Reference in New Issue
Block a user