From 89ed0b769d6adf30364f60e6b1566961821a9893 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Sun, 9 Oct 2022 20:41:25 -0700 Subject: [PATCH 01/19] powerpc/pseries/vas: Add VAS IRQ primary handler irq_default_primary_handler() can be used only with IRQF_ONESHOT flag, but the flag disables IRQ before executing the thread handler and enables it after the interrupt is handled. But this IRQ disable sets the VAS IRQ OFF state in the hypervisor. In case if NX faults during this window, the hypervisor will not deliver the fault interrupt to the partition and the user space may wait continuously for the CSB update. So use VAS specific IRQ handler instead of calling the default primary handler. Increment pending_faults counter in IRQ handler and the bottom thread handler will process all faults based on this counter. In case if the another interrupt is received while the thread is running, it will be processed using this counter. The synchronization of top and bottom handlers will be done with IRQTF_RUNTHREAD flag and will re-enter to bottom half if this flag is set. Signed-off-by: Haren Myneni Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/aaad8813b4762a6753cfcd0b605a7574a5192ec7.camel@linux.ibm.com --- arch/powerpc/platforms/pseries/vas.c | 40 +++++++++++++++++++++++----- arch/powerpc/platforms/pseries/vas.h | 1 + 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 0e0524cbe20c..53b36381489e 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -200,16 +200,41 @@ static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) struct vas_user_win_ref *tsk_ref; int rc; - rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb)); - if (!rc) { - tsk_ref = &txwin->vas_win.task_ref; - vas_dump_crb(&crb); - vas_update_csb(&crb, tsk_ref); + while (atomic_read(&txwin->pending_faults)) { + rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb)); + if (!rc) { + tsk_ref = &txwin->vas_win.task_ref; + vas_dump_crb(&crb); + vas_update_csb(&crb, tsk_ref); + } + atomic_dec(&txwin->pending_faults); } return IRQ_HANDLED; } +/* + * irq_default_primary_handler() can be used only with IRQF_ONESHOT + * which disables IRQ before executing the thread handler and enables + * it after. But this disabling interrupt sets the VAS IRQ OFF + * state in the hypervisor. If the NX generates fault interrupt + * during this window, the hypervisor will not deliver this + * interrupt to the LPAR. So use VAS specific IRQ handler instead + * of calling the default primary handler. + */ +static irqreturn_t pseries_vas_irq_handler(int irq, void *data) +{ + struct pseries_vas_window *txwin = data; + + /* + * The thread hanlder will process this interrupt if it is + * already running. + */ + atomic_inc(&txwin->pending_faults); + + return IRQ_WAKE_THREAD; +} + /* * Allocate window and setup IRQ mapping. 
*/ @@ -240,8 +265,9 @@ static int allocate_setup_window(struct pseries_vas_window *txwin, goto out_irq; } - rc = request_threaded_irq(txwin->fault_virq, NULL, - pseries_vas_fault_thread_fn, IRQF_ONESHOT, + rc = request_threaded_irq(txwin->fault_virq, + pseries_vas_irq_handler, + pseries_vas_fault_thread_fn, 0, txwin->name, txwin); if (rc) { pr_err("VAS-Window[%d]: Request IRQ(%u) failed with %d\n", diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index 333ffa2f9f42..a2cb12a31c17 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -132,6 +132,7 @@ struct pseries_vas_window { u64 flags; char *name; int fault_virq; + atomic_t pending_faults; /* Number of pending faults */ }; int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps); From 2147783d6bf0b7ca14c72a25527dc5135bd17f65 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Thu, 6 Oct 2022 22:29:59 -0700 Subject: [PATCH 02/19] powerpc/pseries: Use lparcfg to reconfig VAS windows for DLPAR CPU The hypervisor assigns VAS (Virtual Accelerator Switchboard) windows depends on cores configured in LPAR. The kernel uses OF reconfig notifier to reconfig VAS windows for DLPAR CPU event. In the case of shared CPU mode partition, the hypervisor assigns VAS windows depends on CPU entitled capacity, not based on vcpus. When the user changes CPU entitled capacity for the partition, drmgr uses /proc/ppc64/lparcfg interface to notify the kernel. This patch adds the following changes to update VAS resources for shared mode: - Call vas reconfig windows from lparcfg_write() - Ignore reconfig changes in the VAS notifier Signed-off-by: Haren Myneni [mpe: Rework error handling, report any errors as EIO] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/efa9c16e4a78dda4567a16f13dabfd73cb4674a2.camel@linux.ibm.com --- arch/powerpc/platforms/pseries/lparcfg.c | 11 ++++++ arch/powerpc/platforms/pseries/vas.c | 43 ++++++++++++++++-------- arch/powerpc/platforms/pseries/vas.h | 5 +++ 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 507dc0b5987d..63fd925ccbb8 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -35,6 +35,7 @@ #include #include "pseries.h" +#include "vas.h" /* pseries_vas_dlpar_cpu() */ /* * This isn't a module but we expose that to userspace @@ -748,6 +749,16 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf, return -EINVAL; retval = update_ppp(new_entitled_ptr, NULL); + + if (retval == H_SUCCESS || retval == H_CONSTRAINED) { + /* + * The hypervisor assigns VAS resources based + * on entitled capacity for shared mode. + * Reconfig VAS windows based on DLPAR CPU events. 
+ */ + if (pseries_vas_dlpar_cpu() != 0) + retval = H_HARDWARE; + } } else if (!strcmp(kbuf, "capacity_weight")) { char *endp; *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 53b36381489e..4ad6e510d405 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -852,6 +852,25 @@ int vas_reconfig_capabilties(u8 type, int new_nr_creds) mutex_unlock(&vas_pseries_mutex); return rc; } + +int pseries_vas_dlpar_cpu(void) +{ + int new_nr_creds, rc; + + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, new_nr_creds); + } + + if (rc) + pr_err("Failed reconfig VAS capabilities with DLPAR\n"); + + return rc; +} + /* * Total number of default credits available (target_credits) * in LPAR depends on number of cores configured. It varies based on @@ -866,7 +885,15 @@ static int pseries_vas_notifier(struct notifier_block *nb, struct of_reconfig_data *rd = data; struct device_node *dn = rd->dn; const __be32 *intserv = NULL; - int new_nr_creds, len, rc = 0; + int len; + + /* + * For shared CPU partition, the hypervisor assigns total credits + * based on entitled core capacity. So updating VAS windows will + * be called from lparcfg_write(). + */ + if (is_shared_processor()) + return NOTIFY_OK; if ((action == OF_RECONFIG_ATTACH_NODE) || (action == OF_RECONFIG_DETACH_NODE)) @@ -878,19 +905,7 @@ static int pseries_vas_notifier(struct notifier_block *nb, if (!intserv) return NOTIFY_OK; - rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, - vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, - (u64)virt_to_phys(&hv_cop_caps)); - if (!rc) { - new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); - rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, - new_nr_creds); - } - - if (rc) - pr_err("Failed reconfig VAS capabilities with DLPAR\n"); - - return rc; + return pseries_vas_dlpar_cpu(); } static struct notifier_block pseries_vas_nb = { diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index a2cb12a31c17..7115043ec488 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -141,10 +141,15 @@ int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps); #ifdef CONFIG_PPC_VAS int vas_migration_handler(int action); +int pseries_vas_dlpar_cpu(void); #else static inline int vas_migration_handler(int action) { return 0; } +static inline int pseries_vas_dlpar_cpu(void) +{ + return 0; +} #endif #endif /* _VAS_H */ From be83d5485da549d934ec65463ea831709f2827b1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 09:07:08 +1000 Subject: [PATCH 03/19] powerpc/64s: Add lockdep for HPTE lock Add lockdep annotation for the HPTE bit-spinlock. Modern systems don't take the tlbie lock, so this shows up some of the same lockdep warnings that were being reported by the ppc970. And they're not taken in exactly the same places so this is nice to have in its own right. 
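For readers unfamiliar with lockdep's manual annotation API, the pattern this patch applies to the HPTE bit-spinlock can be sketched as follows. This is an illustrative condensation only: the names my_bitlock_map, MY_LOCK_BIT, my_lock() and my_unlock() are invented, while STATIC_LOCKDEP_MAP_INIT(), lock_map_acquire() and lock_map_release() are the real lockdep primitives the patch uses.

    #include <linux/bitops.h>
    #include <linux/lockdep.h>
    #include <asm/processor.h>

    #define MY_LOCK_BIT	0	/* bit in the word that acts as the lock */

    #ifdef CONFIG_LOCKDEP
    /* A static map gives the otherwise-invisible bit-lock an identity lockdep can track. */
    static struct lockdep_map my_bitlock_map =
    	STATIC_LOCKDEP_MAP_INIT("my_bitlock", &my_bitlock_map);

    static void acquire_my_bitlock(void) { lock_map_acquire(&my_bitlock_map); }
    static void release_my_bitlock(void) { lock_map_release(&my_bitlock_map); }
    #else
    static void acquire_my_bitlock(void) { }
    static void release_my_bitlock(void) { }
    #endif

    static void my_lock(unsigned long *word)
    {
    	acquire_my_bitlock();		/* record the acquire for lockdep */
    	while (test_and_set_bit_lock(MY_LOCK_BIT, word))
    		cpu_relax();		/* spin until the lock bit is free */
    }

    static void my_unlock(unsigned long *word)
    {
    	release_my_bitlock();		/* record the release for lockdep */
    	clear_bit_unlock(MY_LOCK_BIT, word);
    }

With the acquire/release reported to lockdep, ordering violations against other locks (such as the tlbie lock) can be detected even on systems that never exercise the problematic paths at runtime.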
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013230710.1987253-1-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_native.c | 42 +++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 623a7b7ab38b..7a640e610ea3 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -43,6 +43,29 @@ static DEFINE_RAW_SPINLOCK(native_tlbie_lock); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map hpte_lock_map = + STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map); + +static void acquire_hpte_lock(void) +{ + lock_map_acquire(&hpte_lock_map); +} + +static void release_hpte_lock(void) +{ + lock_map_release(&hpte_lock_map); +} +#else +static void acquire_hpte_lock(void) +{ +} + +static void release_hpte_lock(void) +{ +} +#endif + static inline unsigned long ___tlbie(unsigned long vpn, int psize, int apsize, int ssize) { @@ -220,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + acquire_hpte_lock(); while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; @@ -234,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + release_hpte_lock(); clear_bit_unlock(HPTE_LOCK_BIT, word); } @@ -279,6 +304,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hpte_v = hpte_old_to_new_v(hpte_v); } + release_hpte_lock(); hptep->r = cpu_to_be64(hpte_r); /* Guarantee the second dword is visible before the valid bit */ eieio(); @@ -327,6 +353,7 @@ static long native_hpte_remove(unsigned long hpte_group) return -1; /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; return i; @@ -517,10 +544,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, /* recheck with locks held */ hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - else + } else native_unlock_hpte(hptep); } /* @@ -580,10 +608,8 @@ static void native_hugepage_invalidate(unsigned long vsid, hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. NOTE: this also unlocks it - */ - + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; } else native_unlock_hpte(hptep); @@ -765,8 +791,10 @@ static void native_flush_hash_range(unsigned long number, int local) if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - else + else { + release_hpte_lock(); hptep->v = 0; + } } pte_iterate_hashed_end(); } From 35159b5717fa9c6031fdd6a2193c7a3dc717ce33 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 09:07:09 +1000 Subject: [PATCH 04/19] powerpc/64s: make HPTE lock and native_tlbie_lock irq-safe With kfence enabled, there are several cases where HPTE and TLBIE locks are called from softirq context, for example: WARNING: inconsistent lock state 6.0.0-11845-g0cbbc95b12ac #1 Tainted: G N -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. 
swapper/0/1 [HC0[0]:SC0[0]:HE1:SE1] takes: c000000002734de8 (native_tlbie_lock){+.?.}-{2:2}, at: .native_hpte_updateboltedpp+0x1a4/0x600 {IN-SOFTIRQ-W} state was registered at: .lock_acquire+0x20c/0x520 ._raw_spin_lock+0x4c/0x70 .native_hpte_invalidate+0x62c/0x840 .hash__kernel_map_pages+0x450/0x640 .kfence_protect+0x58/0xc0 .kfence_guarded_free+0x374/0x5a0 .__slab_free+0x3d0/0x630 .put_cred_rcu+0xcc/0x120 .rcu_core+0x3c4/0x14e0 .__do_softirq+0x1dc/0x7dc .do_softirq_own_stack+0x40/0x60 Fix this by consistently disabling irqs while taking either of these locks. Don't just disable bh because several of the more common cases already disable irqs, so this just makes the locks always irq-safe. Reported-by: Guenter Roeck Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013230710.1987253-2-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_native.c | 27 ++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 7a640e610ea3..9342e79870df 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -268,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; + unsigned long flags; int i; + local_irq_save(flags); + if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", @@ -288,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hptep++; } - if (i == HPTES_PER_GROUP) + if (i == HPTES_PER_GROUP) { + local_irq_restore(flags); return -1; + } hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; @@ -304,7 +309,6 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hpte_v = hpte_old_to_new_v(hpte_v); } - release_hpte_lock(); hptep->r = cpu_to_be64(hpte_r); /* Guarantee the second dword is visible before the valid bit */ eieio(); @@ -312,10 +316,13 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ + release_hpte_lock(); hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); + local_irq_restore(flags); + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); } @@ -366,6 +373,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0, local = 0; + unsigned long irqflags; + + local_irq_save(irqflags); want_v = hpte_encode_avpn(vpn, bpsize, ssize); @@ -409,6 +419,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, if (!(flags & HPTE_NOHPTE_UPDATE)) tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(irqflags); + return ret; } @@ -472,6 +484,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -490,6 +505,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, * actual page size will be same. 
*/ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); } /* @@ -503,6 +520,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -520,6 +540,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) /* Invalidate the TLB */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); + return 0; } From b12eb279ff552bd67c167b0fe701ae602aa7311e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 09:07:10 +1000 Subject: [PATCH 05/19] powerpc/64s: make linear_map_hash_lock a raw spinlock This lock is taken while the raw kfence_freelist_lock is held, so it must also be a raw spinlock, as reported by lockdep when raw lock nesting checking is enabled. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013230710.1987253-3-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_utils.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index df008edf7be0..6df4c6d38b66 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1981,7 +1981,7 @@ repeat: } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) -static DEFINE_SPINLOCK(linear_map_hash_lock); +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) { @@ -2005,10 +2005,10 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) mmu_linear_psize, mmu_kernel_ssize); BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); BUG_ON(linear_map_hash_slots[lmi] & 0x80); linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); } static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) @@ -2018,14 +2018,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); if (!(linear_map_hash_slots[lmi] & 0x80)) { - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); return; } hidx = linear_map_hash_slots[lmi] & 0x7f; linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; From b9ef323ea1682f9837bf63ba10c5e3750f71a20a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 01:16:45 +1000 Subject: [PATCH 06/19] powerpc/64s: Disable preemption in hash lazy mmu mode apply_to_page_range on kernel pages does not disable preemption, which is a requirement for hash's lazy mmu mode, which keeps track of the TLBs to flush with a per-cpu array. 
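The requirement here is generic: the lazy-MMU flush batch is per-CPU state, so the task must stay on one CPU between entering and leaving lazy MMU mode. A stripped-down sketch of that pattern follows (struct and function names are invented for illustration; the actual change is the tlbflush-hash.h hunk below):

    #include <linux/percpu.h>
    #include <linux/preempt.h>

    struct flush_batch {
    	int		active;
    	unsigned int	index;
    	unsigned long	pending[192];	/* entries queued for a later flush */
    };

    static DEFINE_PER_CPU(struct flush_batch, flush_batch);

    static void example_enter_lazy_mode(void)
    {
    	struct flush_batch *batch;

    	preempt_disable();		/* pin the task to this CPU's batch */
    	batch = this_cpu_ptr(&flush_batch);
    	batch->active = 1;
    }

    static void example_leave_lazy_mode(void)
    {
    	struct flush_batch *batch = this_cpu_ptr(&flush_batch);

    	if (batch->index)
    		batch->index = 0;	/* a real implementation flushes pending[] here */
    	batch->active = 0;
    	preempt_enable();		/* migration is safe again */
    }

Without the preempt_disable()/preempt_enable() pair, a task migrated in the middle of the section would leave entries stranded in one CPU's batch while marking a different CPU's batch inactive.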
Reported-by: Guenter Roeck Signed-off-by: Nicholas Piggin Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013151647.1857994-1-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index fab8332fe1ad..751921f6db46 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -32,6 +32,11 @@ static inline void arch_enter_lazy_mmu_mode(void) if (radix_enabled()) return; + /* + * apply_to_page_range can call us this preempt enabled when + * operating on kernel page tables. + */ + preempt_disable(); batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } @@ -47,6 +52,7 @@ static inline void arch_leave_lazy_mmu_mode(void) if (batch->index) __flush_tlb_pending(batch); batch->active = 0; + preempt_enable(); } #define arch_flush_lazy_mmu_mode() do {} while (0) From 2b2095f3a6b43ec36ff890febc588df1ec32e826 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 01:16:46 +1000 Subject: [PATCH 07/19] powerpc/64s: Fix hash__change_memory_range preemption warning stop_machine_cpuslocked takes a mutex so it must be called in a preemptible context, so it can't simply be fixed by disabling preemption. This is not a bug, because CPU hotplug is locked, so this processor will call in to the stop machine function. So raw_smp_processor_id() could be used. This leaves a small chance that this thread will be migrated to another CPU, so the master work would be done by a CPU from a different context. Better for test coverage to make that a common case by just having the first CPU to call in become the master. Signed-off-by: Nicholas Piggin Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013151647.1857994-2-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_pgtable.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 747492edb75a..51f48984abca 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -404,7 +404,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); struct change_memory_parms { unsigned long start, end, newpp; - unsigned int step, nr_cpus, master_cpu; + unsigned int step, nr_cpus; + atomic_t master_cpu; atomic_t cpu_counter; }; @@ -478,7 +479,8 @@ static int change_memory_range_fn(void *data) { struct change_memory_parms *parms = data; - if (parms->master_cpu != smp_processor_id()) + // First CPU goes through, all others wait. 
+ if (atomic_xchg(&parms->master_cpu, 1) == 1) return chmem_secondary_loop(parms); // Wait for all but one CPU (this one) to call-in @@ -516,7 +518,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end, chmem_parms.end = end; chmem_parms.step = step; chmem_parms.newpp = newpp; - chmem_parms.master_cpu = smp_processor_id(); + atomic_set(&chmem_parms.master_cpu, 0); cpus_read_lock(); From 00ff1eaac129a24516a3f6d75adfb9df1efb55dd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 01:16:47 +1000 Subject: [PATCH 08/19] powerpc: Fix reschedule bug in KUAP-unlocked user copy schedule must not be explicitly called while KUAP is unlocked, because the AMR register will not be saved across the context switch on 64s (preemption is allowed because that is driven by interrupts which do save the AMR). exit_vmx_usercopy() runs inside an unlocked user access region, and it calls preempt_enable() which will call schedule() if need_resched() was set while non-preemptible. This can cause tasks to run unprotected when the should not, and can cause the user copy to be improperly blocked when scheduling back to it. Fix this by avoiding the explicit resched for preempt kernels by generating an interrupt to reschedule the context if need_resched() got set. Reported-by: Samuel Holland Signed-off-by: Nicholas Piggin Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013151647.1857994-3-npiggin@gmail.com --- arch/powerpc/lib/vmx-helper.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c index f76a50291fd7..d491da8d1838 100644 --- a/arch/powerpc/lib/vmx-helper.c +++ b/arch/powerpc/lib/vmx-helper.c @@ -36,7 +36,17 @@ int exit_vmx_usercopy(void) { disable_kernel_altivec(); pagefault_enable(); - preempt_enable(); + preempt_enable_no_resched(); + /* + * Must never explicitly call schedule (including preempt_enable()) + * while in a kuap-unlocked user copy, because the AMR register will + * not be saved and restored across context switch. However preempt + * kernels need to be preempted as soon as possible if need_resched is + * set and we are preemptible. The hack here is to schedule a + * decrementer to fire here and reschedule for us if necessary. + */ + if (IS_ENABLED(CONFIG_PREEMPT) && need_resched()) + set_dec(1); return 0; } From e59b3399fde5e173b026d4952b215043e77b4521 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 13:07:28 +1000 Subject: [PATCH 09/19] KVM: PPC: BookS PR-KVM and BookE do not support context tracking The context tracking code in PR-KVM and BookE implementations is not complete, and can cause host crashes if context tracking is enabled. Make these implementations depend on !CONTEXT_TRACKING_USER. 
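Stepping back to the stop_machine change two patches above: the "first CPU to call in becomes the master" election it introduces is a small reusable pattern, sketched below with invented names. Every CPU runs the same callback; whichever flips master_claimed from 0 to 1 first takes the master path and all others take the secondary path, so it no longer matters which CPU the work was originally queued from.

    #include <linux/atomic.h>

    struct run_parms {
    	atomic_t master_claimed;	/* set to 0 before the CPUs are kicked off */
    	/* ... shared inputs for the work ... */
    };

    static int secondary_loop(struct run_parms *parms)
    {
    	/* wait for the master, do any per-CPU work, etc. */
    	return 0;
    }

    static int master_work(struct run_parms *parms)
    {
    	/* the one-time work only a single CPU may perform */
    	return 0;
    }

    static int run_on_each_cpu_fn(void *data)
    {
    	struct run_parms *parms = data;

    	/* First CPU through the xchg sees 0 and becomes the master. */
    	if (atomic_xchg(&parms->master_claimed, 1) == 1)
    		return secondary_loop(parms);

    	return master_work(parms);
    }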
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221014030729.2077151-2-npiggin@gmail.com --- arch/powerpc/kvm/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 61cdd782d3c5..a9f57dad6d91 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -51,6 +51,7 @@ config KVM_BOOK3S_HV_POSSIBLE config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT + depends on !CONTEXT_TRACKING_USER select KVM select KVM_BOOK3S_32_HANDLER select KVM_BOOK3S_PR_POSSIBLE @@ -105,6 +106,7 @@ config KVM_BOOK3S_64_HV config KVM_BOOK3S_64_PR tristate "KVM support without using hypervisor mode in host" depends on KVM_BOOK3S_64 + depends on !CONTEXT_TRACKING_USER select KVM_BOOK3S_PR_POSSIBLE help Support running guest kernels in virtual machines on processors @@ -190,6 +192,7 @@ config KVM_EXIT_TIMING config KVM_E500V2 bool "KVM support for PowerPC E500v2 processors" depends on PPC_E500 && !PPC_E500MC + depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO select MMU_NOTIFIER @@ -205,6 +208,7 @@ config KVM_E500V2 config KVM_E500MC bool "KVM support for PowerPC E500MC/E5500/E6500 processors" depends on PPC_E500MC + depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO select KVM_BOOKE_HV From a073672eb09670540e95a2a4aa1c46f5da74159f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 13:07:29 +1000 Subject: [PATCH 10/19] powerpc/64/interrupt: Prevent NMI PMI causing a dangerous warning NMI PMIs really should not return using the normal interrupt_return function. If such a PMI hits in code returning to user with the context switched to user mode, this warning can fire. This was enough to cause crashes when reproducing on 64s, because another perf interrupt would hit while reporting bug, and that would cause another bug, and so on until smashing the stack. Work around that particular crash for now by just disabling that context warning for PMIs. This is a hack and not a complete fix, there could be other such problems lurking in corners. But it does fix the known crash. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221014030729.2077151-3-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64e.S | 7 +++++++ arch/powerpc/kernel/interrupt.c | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 930e36099015..2f68fb2ee4fc 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -813,6 +813,13 @@ kernel_dbg_exc: EXCEPTION_COMMON(0x260) CHECK_NAPPING() addi r3,r1,STACK_FRAME_OVERHEAD + /* + * XXX: Returning from performance_monitor_exception taken as a + * soft-NMI (Linux irqs disabled) may be risky to use interrupt_return + * and could cause bugs in return or elsewhere. That case should just + * restore registers and return. There is a workaround for one known + * problem in interrupt_exit_kernel_prepare(). 
+ */ bl performance_monitor_exception b interrupt_return diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index f9db0a172401..7bc93367de68 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -374,10 +374,16 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) if (regs_is_unrecoverable(regs)) unrecoverable_exception(regs); /* - * CT_WARN_ON comes here via program_check_exception, - * so avoid recursion. + * CT_WARN_ON comes here via program_check_exception, so avoid + * recursion. + * + * Skip the assertion on PMIs to work around a problem caused by NMI + * PMIs incorrectly taking this interrupt return path, it's possible + * for this to hit after interrupt exit to user switches context to + * user. See also the comment in the performance monitor handler in + * exceptions-64e/s.S */ - if (TRAP(regs) != INTERRUPT_PROGRAM) + if (TRAP(regs) != INTERRUPT_PROGRAM && TRAP(regs) != INTERRUPT_PERFMON) CT_WARN_ON(ct_state() == CONTEXT_USER); kuap = kuap_get_and_assert_locked(); From dc398a084d459f065658855454e09f2778f8c5cc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 7 Oct 2022 00:04:11 +1000 Subject: [PATCH 11/19] powerpc/64s/interrupt: Perf NMI should not take normal exit path NMI interrupts should exit with EXCEPTION_RESTORE_REGS not with interrupt_return_srr, which is what the perf NMI handler currently does. This breaks if a PMI hits after interrupt_exit_user_prepare_main() has switched the context tracking to user mode, then the CT_WARN_ON() in interrupt_exit_kernel_prepare() fires because it returns to kernel with context set to user. This could possibly be solved by soft-disabling PMIs in the exit path, but that reduces our ability to profile that code. The warning could be removed, but it's potentially useful. All other NMIs and soft-NMIs return using EXCEPTION_RESTORE_REGS, so this makes perf interrupts consistent with that and seems like the best fix. Signed-off-by: Nicholas Piggin [mpe: Squash in fixups from Nick] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221006140413.126443-3-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 14 +++++++++++++- arch/powerpc/kernel/interrupt.c | 14 ++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5381a43e50fe..651c36b056bd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -2357,9 +2357,21 @@ EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) EXC_COMMON_BEGIN(performance_monitor_common) GEN_COMMON performance_monitor addi r3,r1,STACK_FRAME_OVERHEAD - bl performance_monitor_exception + lbz r4,PACAIRQSOFTMASK(r13) + cmpdi r4,IRQS_ENABLED + bne 1f + bl performance_monitor_exception_async b interrupt_return_srr +1: + bl performance_monitor_exception_nmi + /* Clear MSR_RI before setting SRR0 and SRR1. */ + li r9,0 + mtmsrd r9,1 + kuap_kernel_restore r9, r10 + + EXCEPTION_RESTORE_REGS hsrr=0 + RFI_TO_KERNEL /** * Interrupt 0xf20 - Vector Unavailable Interrupt. diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 7bc93367de68..fc6631a80527 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -377,13 +377,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) * CT_WARN_ON comes here via program_check_exception, so avoid * recursion. 
* - * Skip the assertion on PMIs to work around a problem caused by NMI - * PMIs incorrectly taking this interrupt return path, it's possible - * for this to hit after interrupt exit to user switches context to - * user. See also the comment in the performance monitor handler in - * exceptions-64e/s.S + * Skip the assertion on PMIs on 64e to work around a problem caused + * by NMI PMIs incorrectly taking this interrupt return path, it's + * possible for this to hit after interrupt exit to user switches + * context to user. See also the comment in the performance monitor + * handler in exceptions-64e.S */ - if (TRAP(regs) != INTERRUPT_PROGRAM && TRAP(regs) != INTERRUPT_PERFMON) + if (!IS_ENABLED(CONFIG_PPC_BOOK3E_64) && + TRAP(regs) != INTERRUPT_PROGRAM && + TRAP(regs) != INTERRUPT_PERFMON) CT_WARN_ON(ct_state() == CONTEXT_USER); kuap = kuap_get_and_assert_locked(); From 65722736c3baf29e02e964a09e85c9ef71c48e8d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 22 Oct 2022 15:22:07 +1000 Subject: [PATCH 12/19] powerpc/64s/interrupt: Fix clear of PACA_IRQS_HARD_DIS when returning to soft-masked context Commit a4cb3651a1743 ("powerpc/64s/interrupt: Fix lost interrupts when returning to soft-masked context") fixed the problem of pending irqs being cleared when clearing the HARD_DIS bit, but then it didn't clear the bit at all. This change clears HARD_DIS without affecting other bits in the mask. When an interrupt hits in a soft-masked section that has MSR[EE]=1, it can hard disable and set PACA_IRQS_HARD_DIS, which must be cleared when returning to the EE=1 caller (unless it was set due to a MUST_HARD_MASK interrupt becoming pending). Failure to clear this leaves the returned-to context running with MSR[EE]=1 and PACA_IRQS_HARD_DIS, which confuses irq assertions and could be dangerous for code that might test the flag. This was observed in a hash MMU kernel where a kernel hash fault hits in a local_irqs_disabled region that has EE=1. The hash fault also runs with EE=1, then as it returns, a decrementer hits in the restart section and the irq restart code hard-masks which sets the PACA_IRQ_HARD_DIS flag, which is not clear when the original context is returned to. Reported-by: Sachin Sant Fixes: a4cb3651a1743 ("powerpc/64s/interrupt: Fix lost interrupts when returning to soft-masked context") Signed-off-by: Nicholas Piggin Tested-by: Sachin Sant Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221022052207.471328-1-npiggin@gmail.com --- arch/powerpc/kernel/interrupt_64.S | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 978a173eb339..a019ed6fc839 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -532,15 +532,24 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel) * Returning to soft-disabled context. * Check if a MUST_HARD_MASK interrupt has become pending, in which * case we need to disable MSR[EE] in the return context. + * + * The MSR[EE] check catches among other things the short incoherency + * in hard_irq_disable() between clearing MSR[EE] and setting + * PACA_IRQ_HARD_DIS. */ ld r12,_MSR(r1) andi. r10,r12,MSR_EE beq .Lfast_kernel_interrupt_return_\srr\() // EE already disabled lbz r11,PACAIRQHAPPENED(r13) andi. 
r10,r11,PACA_IRQ_MUST_HARD_MASK - beq .Lfast_kernel_interrupt_return_\srr\() // No HARD_MASK pending + bne 1f // HARD_MASK is pending + // No HARD_MASK pending, clear possible HARD_DIS set by interrupt + andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l + stb r11,PACAIRQHAPPENED(r13) + b .Lfast_kernel_interrupt_return_\srr\() - /* Must clear MSR_EE from _MSR */ + +1: /* Must clear MSR_EE from _MSR */ #ifdef CONFIG_PPC_BOOK3S li r10,0 /* Clear valid before changing _MSR */ From 2153fc9623e5465f503d793d4c94ad65e9ec9b5f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 27 Oct 2022 23:56:26 +1100 Subject: [PATCH 13/19] powerpc/64e: Fix amdgpu build on Book3E w/o AltiVec There's a build failure for Book3E without AltiVec: Error: cc1: error: AltiVec not supported in this target make[6]: *** [/linux/scripts/Makefile.build:250: drivers/gpu/drm/amd/amdgpu/../display/dc/dml/display_mode_lib.o] Error 1 This happens because the amdgpu build is only gated by PPC_LONG_DOUBLE_128, but that symbol can be enabled even though AltiVec is disabled. The only user of PPC_LONG_DOUBLE_128 is amdgpu, so just add a dependency on AltiVec to that symbol to fix the build. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221027125626.1383092-1-mpe@ellerman.id.au --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 699df27b0e2f..20fb1765238c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -285,7 +285,7 @@ config PPC # config PPC_LONG_DOUBLE_128 - depends on PPC64 + depends on PPC64 && ALTIVEC def_bool $(success,test "$(shell,echo __LONG_DOUBLE_128__ | $(CC) -E -P -)" = 1) config PPC_BARRIER_NOSPEC From 40ff21432883216aa440b6619d559ad8f7d7a7d9 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Mon, 31 Oct 2022 14:23:13 +0100 Subject: [PATCH 14/19] asm-generic: compat: fix compat_arg_u64() and compat_arg_u64_dual() The macros are defined backwards. This affects the following compat syscalls: - compat_sys_truncate64() - compat_sys_ftruncate64() - compat_sys_fallocate() - compat_sys_sync_file_range() - compat_sys_fadvise64_64() - compat_sys_readahead() - compat_sys_pread64() - compat_sys_pwrite64() Fixes: 43d5de2b67d7 ("asm-generic: compat: Support BE for long long args in 32-bit ABIs") Signed-off-by: Andreas Schwab [mpe: Add list of affected syscalls] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/871qqoyvni.fsf_-_@igel.home --- include/asm-generic/compat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/compat.h b/include/asm-generic/compat.h index aeb257ad3d1a..8392caea398f 100644 --- a/include/asm-generic/compat.h +++ b/include/asm-generic/compat.h @@ -15,7 +15,7 @@ #endif #ifndef compat_arg_u64 -#ifdef CONFIG_CPU_BIG_ENDIAN +#ifndef CONFIG_CPU_BIG_ENDIAN #define compat_arg_u64(name) u32 name##_lo, u32 name##_hi #define compat_arg_u64_dual(name) u32, name##_lo, u32, name##_hi #else From ce883a2ba310cd7c291bb66ce5d207965fca6003 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Mon, 31 Oct 2022 15:47:35 +0100 Subject: [PATCH 15/19] powerpc/32: fix syscall wrappers with 64-bit arguments With the introduction of syscall wrappers all wrappers for syscalls with 64-bit arguments must be handled specially, not only those that have unaligned 64-bit arguments. This left out the fallocate() and sync_file_range2() syscalls. 
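Because the asm-generic compat_arg_u64() fix above is a single #ifdef-to-#ifndef flip, it helps to see roughly what the corrected macros provide (paraphrased here rather than quoted verbatim): on a little-endian 32-bit ABI the low half of a 64-bit argument occupies the first register, on big-endian the high half does, and the reassembly helper must agree with that order.

    #ifndef CONFIG_CPU_BIG_ENDIAN
    #define compat_arg_u64(name)		u32  name##_lo, u32  name##_hi
    #define compat_arg_u64_dual(name)	u32, name##_lo, u32, name##_hi
    #else
    #define compat_arg_u64(name)		u32  name##_hi, u32  name##_lo
    #define compat_arg_u64_dual(name)	u32, name##_hi, u32, name##_lo
    #endif
    /* Reassemble the two 32-bit halves into the original 64-bit value. */
    #define compat_arg_u64_glue(name)	(((u64)name##_hi << 32) | (u64)name##_lo)

With the macros backwards, big-endian 32-bit compat tasks had the two halves of every such argument swapped, which is why the syscalls listed above misbehaved.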
Fixes: 7e92e01b7245 ("powerpc: Provide syscall wrapper") Fixes: e23750623835 ("powerpc/32: fix syscall wrappers with 64-bit arguments of unaligned register-pairs") Signed-off-by: Andreas Schwab Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/87mt9cxd6g.fsf_-_@igel.home --- arch/powerpc/include/asm/syscalls.h | 7 +++++++ arch/powerpc/kernel/sys_ppc32.c | 13 ++++++++++++- arch/powerpc/kernel/syscalls/syscall.tbl | 7 +++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index a1142496cd58..6d51b007b59e 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -104,6 +104,13 @@ long sys_ppc_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, unsigned long len2); long sys_ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, size_t len, int advice); +long sys_ppc_sync_file_range2(int fd, unsigned int flags, + unsigned int offset1, + unsigned int offset2, + unsigned int nbytes1, + unsigned int nbytes2); +long sys_ppc_fallocate(int fd, int mode, u32 offset1, u32 offset2, + u32 len1, u32 len2); #endif #ifdef CONFIG_COMPAT long compat_sys_mmap2(unsigned long addr, size_t len, diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 1ab4a4d95aba..d451a8229223 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -112,7 +112,7 @@ PPC32_SYSCALL_DEFINE6(ppc32_fadvise64, advice); } -COMPAT_SYSCALL_DEFINE6(ppc_sync_file_range2, +PPC32_SYSCALL_DEFINE6(ppc_sync_file_range2, int, fd, unsigned int, flags, unsigned int, offset1, unsigned int, offset2, unsigned int, nbytes1, unsigned int, nbytes2) @@ -122,3 +122,14 @@ COMPAT_SYSCALL_DEFINE6(ppc_sync_file_range2, return ksys_sync_file_range(fd, offset, nbytes, flags); } + +#ifdef CONFIG_PPC32 +SYSCALL_DEFINE6(ppc_fallocate, + int, fd, int, mode, + u32, offset1, u32, offset2, u32, len1, u32, len2) +{ + return ksys_fallocate(fd, mode, + merge_64(offset1, offset2), + merge_64(len1, len2)); +} +#endif diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index e9e0df4f9a61..a0be127475b1 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -394,8 +394,11 @@ 305 common signalfd sys_signalfd compat_sys_signalfd 306 common timerfd_create sys_timerfd_create 307 common eventfd sys_eventfd -308 common sync_file_range2 sys_sync_file_range2 compat_sys_ppc_sync_file_range2 -309 nospu fallocate sys_fallocate compat_sys_fallocate +308 32 sync_file_range2 sys_ppc_sync_file_range2 compat_sys_ppc_sync_file_range2 +308 64 sync_file_range2 sys_sync_file_range2 +308 spu sync_file_range2 sys_sync_file_range2 +309 32 fallocate sys_ppc_fallocate compat_sys_fallocate +309 64 fallocate sys_fallocate 310 nospu subpage_prot sys_subpage_prot 311 32 timerfd_settime sys_timerfd_settime32 311 64 timerfd_settime sys_timerfd_settime From 02a771c9a68a9f08cce4ec5e324fb1bc4dce7202 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 1 Nov 2022 14:48:52 +1100 Subject: [PATCH 16/19] powerpc/32: Select ARCH_SPLIT_ARG64 On 32-bit kernels, 64-bit syscall arguments are split into two registers. For that to work with syscall wrappers, the prototype of the syscall must have the argument split so that the wrapper macro properly unpacks the arguments from pt_regs. 
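The following is a paraphrased illustration of what a "split" prototype means here, not the literal declarations from include/linux/syscalls.h: with syscall wrappers enabled, the wrapper macro generates the code that pulls arguments out of pt_regs from the prototype alone, so a 64-bit argument on a 32-bit kernel has to be spelled as two u32 halves in order to be fetched from two register slots and merged in the handler body.

    /*
     * Hypothetical syscall used only for illustration. With ARCH_SPLIT_ARG64
     * the 64-bit length is declared as two u32 halves (their order follows
     * the ABI's endianness, as in the compat_arg_u64() fix above) and is
     * re-merged inside the handler.
     */
    #ifdef CONFIG_ARCH_SPLIT_ARG64
    long sys_example_truncate(unsigned int fd, u32 len_1, u32 len_2);
    #else
    long sys_example_truncate(unsigned int fd, u64 len);
    #endif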
The fanotify_mark() syscall is one such syscall, which already has a split prototype, guarded behind ARCH_SPLIT_ARG64. So select ARCH_SPLIT_ARG64 to get that prototype and fix fanotify_mark() on 32-bit kernels with syscall wrappers. Note also that fanotify_mark() is the only usage of ARCH_SPLIT_ARG64. Fixes: 7e92e01b7245 ("powerpc: Provide syscall wrapper") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221101034852.2340319-1-mpe@ellerman.id.au --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 20fb1765238c..2ca5418457ed 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -147,6 +147,7 @@ config PPC select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT + select ARCH_SPLIT_ARG64 if PPC32 select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x From eb761a1760bf30cf64e98ee8d914866e62ec9e8a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 16 Nov 2022 14:39:53 +1000 Subject: [PATCH 17/19] powerpc: Fix writable sections being moved into the rodata region .data.rel.ro* catches .data.rel.root_cpuacct, and the kernel crashes on a store in css_clear_dir. At least we know read-only data protection is working... Fixes: b6adc6d6d3272 ("powerpc/build: move .data.rel.ro, .sdata2 to read-only") Signed-off-by: Nicholas Piggin Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221116043954.3307852-1-npiggin@gmail.com --- arch/powerpc/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7786e3ac7611..8c3862b4c259 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -142,7 +142,7 @@ SECTIONS #endif .data.rel.ro : AT(ADDR(.data.rel.ro) - LOAD_OFFSET) { - *(.data.rel.ro*) + *(.data.rel.ro .data.rel.ro.*) } .branch_lt : AT(ADDR(.branch_lt) - LOAD_OFFSET) { From 89d21e259a94f7d5582ec675aa445f5a79f347e4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 24 Nov 2022 09:37:27 +0100 Subject: [PATCH 18/19] powerpc/bpf/32: Fix Oops on tail call tests test_bpf tail call tests end up as: test_bpf: #0 Tail call leaf jited:1 85 PASS test_bpf: #1 Tail call 2 jited:1 111 PASS test_bpf: #2 Tail call 3 jited:1 145 PASS test_bpf: #3 Tail call 4 jited:1 170 PASS test_bpf: #4 Tail call load/store leaf jited:1 190 PASS test_bpf: #5 Tail call load/store jited:1 BUG: Unable to handle kernel data access on write at 0xf1b4e000 Faulting instruction address: 0xbe86b710 Oops: Kernel access of bad area, sig: 11 [#1] BE PAGE_SIZE=4K MMU=Hash PowerMac Modules linked in: test_bpf(+) CPU: 0 PID: 97 Comm: insmod Not tainted 6.1.0-rc4+ #195 Hardware name: PowerMac3,1 750CL 0x87210 PowerMac NIP: be86b710 LR: be857e88 CTR: be86b704 REGS: f1b4df20 TRAP: 0300 Not tainted (6.1.0-rc4+) MSR: 00009032 CR: 28008242 XER: 00000000 DAR: f1b4e000 DSISR: 42000000 GPR00: 00000001 f1b4dfe0 c11d2280 00000000 00000000 00000000 00000002 00000000 GPR08: f1b4e000 be86b704 f1b4e000 00000000 00000000 100d816a f2440000 fe73baa8 GPR16: f2458000 00000000 c1941ae4 f1fe2248 00000045 c0de0000 f2458030 00000000 GPR24: 000003e8 0000000f f2458000 f1b4dc90 3e584b46 00000000 f24466a0 c1941a00 NIP [be86b710] 0xbe86b710 LR [be857e88] __run_one+0xec/0x264 [test_bpf] Call Trace: [f1b4dfe0] [00000002] 0x2 (unreliable) Instruction 
dump: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX ---[ end trace 0000000000000000 ]--- This is a tentative to write above the stack. The problem is encoutered with tests added by commit 38608ee7b690 ("bpf, tests: Add load store test case for tail call") This happens because tail call is done to a BPF prog with a different stack_depth. At the time being, the stack is kept as is when the caller tail calls its callee. But at exit, the callee restores the stack based on its own properties. Therefore here, at each run, r1 is erroneously increased by 32 - 16 = 16 bytes. This was done that way in order to pass the tail call count from caller to callee through the stack. As powerpc32 doesn't have a red zone in the stack, it was necessary the maintain the stack as is for the tail call. But it was not anticipated that the BPF frame size could be different. Let's take a new approach. Use register r4 to carry the tail call count during the tail call, and save it into the stack at function entry if required. This means the input parameter must be in r3, which is more correct as it is a 32 bits parameter, then tail call better match with normal BPF function entry, the down side being that we move that input parameter back and forth between r3 and r4. That can be optimised later. Doing that also has the advantage of maximising the common parts between tail calls and a normal function exit. With the fix, tail call tests are now successfull: test_bpf: #0 Tail call leaf jited:1 53 PASS test_bpf: #1 Tail call 2 jited:1 115 PASS test_bpf: #2 Tail call 3 jited:1 154 PASS test_bpf: #3 Tail call 4 jited:1 165 PASS test_bpf: #4 Tail call load/store leaf jited:1 101 PASS test_bpf: #5 Tail call load/store jited:1 141 PASS test_bpf: #6 Tail call error path, max count reached jited:1 994 PASS test_bpf: #7 Tail call count preserved across function calls jited:1 140975 PASS test_bpf: #8 Tail call error path, NULL target jited:1 110 PASS test_bpf: #9 Tail call error path, index out of range jited:1 69 PASS test_bpf: test_tail_calls: Summary: 10 PASSED, 0 FAILED, [10/10 JIT'ed] Suggested-by: Naveen N. Rao Fixes: 51c66ad849a7 ("powerpc/bpf: Implement extended BPF on PPC32") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Tested-by: Naveen N. Rao Link: https://lore.kernel.org/r/757acccb7fbfc78efa42dcf3c974b46678198905.1669278887.git.christophe.leroy@csgroup.eu --- arch/powerpc/net/bpf_jit_comp32.c | 56 +++++++++++++------------------ 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 43f1c76d48ce..a379b0ce19ff 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -113,23 +113,19 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) { int i; + /* Initialize tail_call_cnt, to be skipped if we do tail calls. */ + EMIT(PPC_RAW_LI(_R4, 0)); + +#define BPF_TAILCALL_PROLOGUE_SIZE 4 + + EMIT(PPC_RAW_STWU(_R1, _R1, -BPF_PPC_STACKFRAME(ctx))); + + if (ctx->seen & SEEN_TAILCALL) + EMIT(PPC_RAW_STW(_R4, _R1, bpf_jit_stack_offsetof(ctx, BPF_PPC_TC))); + /* First arg comes in as a 32 bits pointer. */ EMIT(PPC_RAW_MR(bpf_to_ppc(BPF_REG_1), _R3)); EMIT(PPC_RAW_LI(bpf_to_ppc(BPF_REG_1) - 1, 0)); - EMIT(PPC_RAW_STWU(_R1, _R1, -BPF_PPC_STACKFRAME(ctx))); - - /* - * Initialize tail_call_cnt in stack frame if we do tail calls. 
- * Otherwise, put in NOPs so that it can be skipped when we are - * invoked through a tail call. - */ - if (ctx->seen & SEEN_TAILCALL) - EMIT(PPC_RAW_STW(bpf_to_ppc(BPF_REG_1) - 1, _R1, - bpf_jit_stack_offsetof(ctx, BPF_PPC_TC))); - else - EMIT(PPC_RAW_NOP()); - -#define BPF_TAILCALL_PROLOGUE_SIZE 16 /* * We need a stack frame, but we don't necessarily need to @@ -170,6 +166,16 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx for (i = BPF_PPC_NVR_MIN; i <= 31; i++) if (bpf_is_seen_register(ctx, i)) EMIT(PPC_RAW_LWZ(i, _R1, bpf_jit_stack_offsetof(ctx, i))); + + if (ctx->seen & SEEN_FUNC) + EMIT(PPC_RAW_LWZ(_R0, _R1, BPF_PPC_STACKFRAME(ctx) + PPC_LR_STKOFF)); + + /* Tear down our stack frame */ + EMIT(PPC_RAW_ADDI(_R1, _R1, BPF_PPC_STACKFRAME(ctx))); + + if (ctx->seen & SEEN_FUNC) + EMIT(PPC_RAW_MTLR(_R0)); + } void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) @@ -178,16 +184,6 @@ void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) bpf_jit_emit_common_epilogue(image, ctx); - /* Tear down our stack frame */ - - if (ctx->seen & SEEN_FUNC) - EMIT(PPC_RAW_LWZ(_R0, _R1, BPF_PPC_STACKFRAME(ctx) + PPC_LR_STKOFF)); - - EMIT(PPC_RAW_ADDI(_R1, _R1, BPF_PPC_STACKFRAME(ctx))); - - if (ctx->seen & SEEN_FUNC) - EMIT(PPC_RAW_MTLR(_R0)); - EMIT(PPC_RAW_BLR()); } @@ -244,7 +240,6 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o EMIT(PPC_RAW_RLWINM(_R3, b2p_index, 2, 0, 29)); EMIT(PPC_RAW_ADD(_R3, _R3, b2p_bpf_array)); EMIT(PPC_RAW_LWZ(_R3, _R3, offsetof(struct bpf_array, ptrs))); - EMIT(PPC_RAW_STW(_R0, _R1, bpf_jit_stack_offsetof(ctx, BPF_PPC_TC))); /* * if (prog == NULL) @@ -255,19 +250,14 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o /* goto *(prog->bpf_func + prologue_size); */ EMIT(PPC_RAW_LWZ(_R3, _R3, offsetof(struct bpf_prog, bpf_func))); - - if (ctx->seen & SEEN_FUNC) - EMIT(PPC_RAW_LWZ(_R0, _R1, BPF_PPC_STACKFRAME(ctx) + PPC_LR_STKOFF)); - EMIT(PPC_RAW_ADDIC(_R3, _R3, BPF_TAILCALL_PROLOGUE_SIZE)); - - if (ctx->seen & SEEN_FUNC) - EMIT(PPC_RAW_MTLR(_R0)); - EMIT(PPC_RAW_MTCTR(_R3)); EMIT(PPC_RAW_MR(_R3, bpf_to_ppc(BPF_REG_1))); + /* Put tail_call_cnt in r4 */ + EMIT(PPC_RAW_MR(_R4, _R0)); + /* tear restore NVRs, ... */ bpf_jit_emit_common_epilogue(image, ctx); From 2e7ec190a0e38aaa8a6d87fd5f804ec07947febc Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 25 Nov 2022 22:34:42 +1100 Subject: [PATCH 19/19] powerpc/64s: Add missing declaration for machine_check_early_boot() There's no declaration for machine_check_early_boot(), which leads to a build failure with W=1. Add one. Fixes: 2f5182cffa43 ("powerpc/64s: early boot machine check handler") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221125132521.2167039-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/interrupt.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 4745bb9998bd..6d8492b6e2b8 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -602,6 +602,7 @@ ____##func(struct pt_regs *regs) /* kernel/traps.c */ DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception); #ifdef CONFIG_PPC_BOOK3S_64 +DECLARE_INTERRUPT_HANDLER_RAW(machine_check_early_boot); DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async); #endif DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
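As a closing aside on the BPF tail-call fix above: at the program level, a tail call is a jump from one loaded BPF program to another through a BPF_MAP_TYPE_PROG_ARRAY, and chains of such jumps are what the JIT's tail-call count has to bound. A minimal BPF-C sketch of the construct the JIT must support is shown below; it assumes a libbpf-style build, and the prog-array map must be populated with program fds from user space, which is not shown.

    // SPDX-License-Identifier: GPL-2.0
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
    	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
    	__uint(max_entries, 1);
    	__uint(key_size, sizeof(__u32));
    	__uint(value_size, sizeof(__u32));
    } jmp_table SEC(".maps");

    SEC("tc")
    int tail_callee(struct __sk_buff *skb)
    {
    	return 0;			/* the program jumped to */
    }

    SEC("tc")
    int tail_caller(struct __sk_buff *skb)
    {
    	/* On success this never returns to the caller. */
    	bpf_tail_call(skb, &jmp_table, 0);
    	return 0;			/* fallthrough if slot 0 is empty */
    }

    char _license[] SEC("license") = "GPL";

Each successful bpf_tail_call() replaces the running program without returning, so the caller and callee may have different stack depths; carrying the tail-call count in a register rather than at a stack offset is what makes that safe on powerpc/32.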