From 675357362aeba19688440eb1aaa7991067f73b12 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 4 Nov 2017 04:16:12 -0700 Subject: [PATCH 1/2] Revert "x86/mm: Stop calling leave_mm() in idle code" This reverts commit 43858b4f25cf0adc5c2ca9cf5ce5fdf2532941e5. The reason I removed the leave_mm() calls in question is because the heuristic wasn't needed after that patch. With the original version of my PCID series, we never flushed a "lazy cpu" (i.e. a CPU running kernel thread) due a flush on the loaded mm. Unfortunately, that caused architectural issues, so now I've reinstated these flushes on non-PCID systems in: commit b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode"). That, in turn, gives us a power management and occasionally performance regression as compared to old kernels: a process that goes into a deep idle state on a given CPU and gets its mm flushed due to activity on a different CPU will wake the idle CPU. Reinstate the old ugly heuristic: if a CPU goes into ACPI C3 or an intel_idle state that is likely to cause a TLB flush gets its mm switched to init_mm before going idle. FWIW, this heuristic is lousy. Whether we should change CR3 before idle isn't a good hint except insofar as the performance hit is a bit lower if the TLB is getting flushed by the idle code anyway. What we really want to know is whether we anticipate being idle long enough that the mm is likely to be flushed before we wake up. This is more a matter of the expected latency than the idle state that gets chosen. This heuristic also completely fails on systems that don't know whether the TLB will be flushed (e.g. AMD systems?). OTOH it may be a bit obsolete anyway -- PCID systems don't presently benefit from this heuristic at all. We also shouldn't do this callback from innermost bit of the idle code due to the RCU nastiness it causes. All the information need is available before rcu_idle_enter() needs to happen. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 43858b4f25cf "x86/mm: Stop calling leave_mm() in idle code" Link: http://lkml.kernel.org/r/c513bbd4e653747213e05bc7062de000bf0202a5.1509793738.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/acpi.h | 2 ++ arch/x86/include/asm/acpi.h | 2 ++ arch/x86/mm/tlb.c | 17 ++++++++++++++--- drivers/acpi/processor_idle.c | 2 ++ drivers/idle/intel_idle.c | 9 +++++---- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h index c86a947f5368..a3d0211970e9 100644 --- a/arch/ia64/include/asm/acpi.h +++ b/arch/ia64/include/asm/acpi.h @@ -112,6 +112,8 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; } +#define acpi_unlazy_tlb(x) + #ifdef CONFIG_ACPI_NUMA extern cpumask_t early_cpu_possible_map; #define for_each_possible_early_cpu(cpu) \ diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 72d867f6b518..8d0ec9df1cbe 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -150,6 +150,8 @@ static inline void disable_acpi(void) { } extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ +#define acpi_unlazy_tlb(x) leave_mm(x) + #ifdef CONFIG_ACPI_APEI static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0f3d0cea4d00..3118392cdf75 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -85,6 +85,7 @@ void leave_mm(int cpu) switch_mm(NULL, &init_mm, NULL); } +EXPORT_SYMBOL_GPL(leave_mm); void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -195,12 +196,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); write_cr3(build_cr3(next, new_asid)); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); + + /* + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ write_cr3(build_cr3_noflush(next, new_asid)); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); + + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } this_cpu_write(cpu_tlbstate.loaded_mm, next); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 2736e25e9dc6..d50a7b6ccddd 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -710,6 +710,8 @@ static DEFINE_RAW_SPINLOCK(c3_lock); static void acpi_idle_enter_bm(struct acpi_processor *pr, struct acpi_processor_cx *cx, bool timer_bc) { + acpi_unlazy_tlb(smp_processor_id()); + /* * Must be done before busmaster disable as we might need to * access HPET ! diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 5dc7ea4b6bc4..f0b06b14e782 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -913,15 +913,16 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state = &drv->states[index]; unsigned long eax = flg2MWAIT(state->flags); unsigned int cstate; + int cpu = smp_processor_id(); cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; /* - * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition - * will probably flush the TLB. It's not guaranteed to flush - * the TLB, though, so it's not clear that we can do anything - * useful with this knowledge. + * leave_mm() to avoid costly and often unnecessary wakeups + * for flushing the user TLB's associated with the active mm. */ + if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) + leave_mm(cpu); if (!(lapic_timer_reliable_states & (1 << (cstate)))) tick_broadcast_enter(); From eda9cec4c9a12208a6f69fbe68f72a6311d50032 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 3 Nov 2017 07:58:54 -0500 Subject: [PATCH 2/2] x86/module: Detect and skip invalid relocations There have been some cases where external tooling (e.g., kpatch-build) creates a corrupt relocation which targets the wrong address. This is a silent failure which can corrupt memory in unexpected places. On x86, the bytes of data being overwritten by relocations are always initialized to zero beforehand. Use that knowledge to add sanity checks to detect such cases before they corrupt memory. Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jeyu@kernel.org Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/37450d6c6225e54db107fba447ce9e56e5f758e9.1509713553.git.jpoimboe@redhat.com [ Restructured the messages, as it's unclear whether the relocation or the target is corrupted. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/module.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 62e7d70aadd5..da0c160e5589 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -172,19 +172,27 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_X86_64_NONE: break; case R_X86_64_64: + if (*(u64 *)loc != 0) + goto invalid_relocation; *(u64 *)loc = val; break; case R_X86_64_32: + if (*(u32 *)loc != 0) + goto invalid_relocation; *(u32 *)loc = val; if (val != *(u32 *)loc) goto overflow; break; case R_X86_64_32S: + if (*(s32 *)loc != 0) + goto invalid_relocation; *(s32 *)loc = val; if ((s64)val != *(s32 *)loc) goto overflow; break; case R_X86_64_PC32: + if (*(u32 *)loc != 0) + goto invalid_relocation; val -= (u64)loc; *(u32 *)loc = val; #if 0 @@ -200,6 +208,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, } return 0; +invalid_relocation: + pr_err("x86/modules: Skipping invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), loc, val); + return -ENOEXEC; + overflow: pr_err("overflow in relocation type %d val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), val);