From 41d840e224170d2768493e320f290ed060491727 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 12 Jun 2009 12:57:52 +0800 Subject: [PATCH 01/45] x86: change kernel_physical_mapping_init() __init to __meminit kernel_physical_mapping_init() could be called in memory hotplug path. [ Impact: fix potential crash with memory hotplug ] Signed-off-by: Shaohua Li LKML-Reference: <20090612045752.GA827@sli10-desk.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 52bb9519bb86..52e1bff6bfd0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -527,7 +527,7 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, return phys_pud_init(pud, addr, end, page_size_mask); } -unsigned long __init +unsigned long __meminit kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) From 46e443283891dbd45fba7fa037baab831f5d8f3f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 13 Jun 2009 20:00:54 -0700 Subject: [PATCH 02/45] x86: atomic_32.h: Fix kernel-doc warnings Fix kernel-doc warnings in atomic_32.h: Warning(arch/x86/include/asm/atomic_32.h:265): No description found for parameter 'ptr' Warning(arch/x86/include/asm/atomic_32.h:265): Excess function parameter 'v' description in '__atomic64_read' Warning(arch/x86/include/asm/atomic_32.h:305): Excess function parameter 'old_val' description in 'atomic64_xchg' Signed-off-by: Randy Dunlap LKML-Reference: <4A3467E6.6010907@oracle.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index aff9f1fcdcd7..a6a4b357076d 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -257,7 +257,7 @@ typedef struct { /** * atomic64_read - read atomic64 variable - * @v: pointer of type atomic64_t + * @ptr: pointer of type atomic64_t * * Atomically reads the value of @v. * Doesn't imply a read memory barrier. @@ -294,7 +294,6 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, * atomic64_xchg - xchg atomic64 variable * @ptr: pointer to type atomic64_t * @new_val: value to assign - * @old_val: old value that was there * * Atomically xchgs the value of @ptr to @new_val and returns * the old value. From 507fa3a3d80365c595113a5ac3232309e3dbf5d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jun 2009 17:46:01 +0200 Subject: [PATCH 03/45] x86: hpet: Mark per cpu interrupts IRQF_TIMER to prevent resume failure timer interrupts are excluded from being disabled during suspend. The clock events code manages the disabling of clock events on its own because the timer interrupt needs to be functional before the resume code reenables the device interrupts. The hpet per cpu timers request their interrupt without setting the IRQF_TIMER flag so suspend_device_irqs() disables them as well which results in a fatal resume failure on the boot CPU. Adding IRQF_TIMER to the interupt flags when requesting the hpet per cpu timer interrupts solves the problem. Reported-by: Benjamin S. Signed-off-by: Thomas Gleixner Tested-by: Benjamin S. Cc: stable@kernel.org --- arch/x86/kernel/hpet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 81408b93f887..dedc2bddf7a5 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -510,7 +510,8 @@ static int hpet_setup_irq(struct hpet_dev *dev) { if (request_irq(dev->irq, hpet_interrupt_handler, - IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev)) + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + dev->name, dev)) return -1; disable_irq(dev->irq); From 0975904276552c8e201dad0ad31152ba8a21505a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2009 17:52:27 +0200 Subject: [PATCH 04/45] amd-iommu: disable IOMMU hardware on shutdown When the IOMMU stays enabled the BIOS may not be able to finish the machine shutdown properly. So disable the hardware on shutdown. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu.h | 2 ++ arch/x86/kernel/amd_iommu_init.c | 5 +++++ arch/x86/kernel/pci-dma.c | 2 ++ 3 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 262e02820049..bdf96f119f06 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -29,9 +29,11 @@ extern void amd_iommu_detect(void); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); extern void amd_iommu_flush_all_domains(void); extern void amd_iommu_flush_all_devices(void); +extern void amd_iommu_shutdown(void); #else static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } +static inline void amd_iommu_shutdown(void) { } #endif #endif /* _ASM_X86_AMD_IOMMU_H */ diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 238989ec077d..575ca46211bb 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1273,6 +1273,11 @@ free: goto out; } +void amd_iommu_shutdown(void) +{ + disable_iommus(); +} + /**************************************************************************** * * Early detect code. This code runs at IOMMU detection time in the DMA diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 745579bc8256..328592fb6044 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -290,6 +290,8 @@ static int __init pci_iommu_init(void) void pci_iommu_shutdown(void) { gart_iommu_shutdown(); + + amd_iommu_shutdown(); } /* Must execute after PCI subsystem */ fs_initcall(pci_iommu_init); From 61d047be99757fd9b0af900d7abce9a13a337488 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2009 17:56:09 +0200 Subject: [PATCH 05/45] x86: disable IOMMUs on kernel crash If the IOMMUs are still enabled when the kexec kernel boots access to the disk is not possible. This is bad for tools like kdump or anything else which wants to use PCI devices. Signed-off-by: Joerg Roedel --- arch/x86/kernel/crash.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ff958248e61d..5e409dc298a4 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,6 +27,7 @@ #include #include #include +#include #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -103,5 +104,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif + +#ifdef CONFIG_X86_64 + pci_iommu_shutdown(); +#endif + crash_save_cpu(regs, safe_smp_processor_id()); } From 42a49f965a8d24ed92af04f5b564d63f17fd9c56 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Mon, 15 Jun 2009 15:42:00 +0200 Subject: [PATCH 06/45] amd-iommu: flush domain tlb when attaching a new device When kexec'ing to a new kernel (for example, when crashing and launching a kdump session), the AMD IOMMU may have cached translations. The kexec'd kernel, during initialization, will invalidate the IOMMU device table entries, but not the domain translations. These stale entries can cause a device's DMA to fail, makes it rough to write a dump to disk when the disk controller can't DMA ;-) Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 1c60554537c3..9372f0406ad4 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -434,6 +434,16 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); } +/* Flush the whole IO/TLB for a given protection domain - including PDE */ +static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) +{ + u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + + INC_STATS_COUNTER(domain_flush_single); + + iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); +} + /* * This function is used to flush the IO/TLB for a given protection domain * on every IOMMU in the system @@ -1078,7 +1088,13 @@ static void attach_device(struct amd_iommu *iommu, amd_iommu_pd_table[devid] = domain; write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + /* + * We might boot into a crash-kernel here. The crashed kernel + * left the caches in the IOMMU dirty. So we have to flush + * here to evict all dirty stuff. + */ iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_tlb_pde(iommu, domain->id); } /* From a8c485bb6857811807d42f9fd1fde2f5f89cc5c9 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Mon, 15 Jun 2009 15:53:45 +0200 Subject: [PATCH 07/45] amd-iommu: disable cmd buffer and evt logging before reprogramming iommu The IOMMU spec states that IOMMU behavior may be undefined when the IOMMU registers are rewritten while command or event buffer is enabled. Disable them in IOMMU disable path. Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 575ca46211bb..48a79b9b2f9e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -260,6 +260,14 @@ static void iommu_enable(struct amd_iommu *iommu) static void iommu_disable(struct amd_iommu *iommu) { + /* Disable command buffer */ + iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); + + /* Disable event logging and event interrupts */ + iommu_feature_disable(iommu, CONTROL_EVT_INT_EN); + iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); + + /* Disable IOMMU hardware itself */ iommu_feature_disable(iommu, CONTROL_IOMMU_EN); } @@ -1042,6 +1050,7 @@ static void enable_iommus(void) struct amd_iommu *iommu; for_each_iommu(iommu) { + iommu_disable(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); From 09067207f6eacb7f00c8f7f0623c3696083ce042 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 15 Jun 2009 16:06:48 +0200 Subject: [PATCH 08/45] amd-iommu: set event buffer head and tail to 0 manually These registers may contain values from previous kernels. So reset them to known values before enable the event buffer again. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 48a79b9b2f9e..068a3569f837 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -486,6 +486,10 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu) memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, &entry, sizeof(entry)); + /* set head and tail to zero manually */ + writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); } From 6a047d8b9efc4b7d0c57ca4835f7e519e5c90d3f Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Tue, 16 Jun 2009 03:01:37 -0400 Subject: [PATCH 09/45] amd-iommu: resume cleanup Now that enable_iommus() will call iommu_disable() for each iommu, the call to disable_iommus() during resume is redundant. Also, the order for an invalidation is to invalidate device table entries first, then domain translations. Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 068a3569f837..10b2accd12ea 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1079,12 +1079,6 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { - /* - * Disable IOMMUs before reprogramming the hardware registers. - * IOMMU is still enabled from the resume kernel. - */ - disable_iommus(); - /* re-load the hardware */ enable_iommus(); @@ -1092,8 +1086,8 @@ static int amd_iommu_resume(struct sys_device *dev) * we have to flush after the IOMMUs are enabled because a * disabled IOMMU will never execute the commands we send */ - amd_iommu_flush_all_domains(); amd_iommu_flush_all_devices(); + amd_iommu_flush_all_domains(); return 0; } From 5dfaf90f8052327c92fbe3c470a2e6634be296c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Jun 2009 10:23:32 +0200 Subject: [PATCH 10/45] x86: mm: Read cr2 before prefetching the mmap_lock Prefetch instructions can generate spurious faults on certain models of older CPUs. The faults themselves cannot be stopped and they can occur pretty much anywhere - so the way we solve them is that we detect certain patterns and ignore the fault. There is one small path of code where we must not take faults though: the #PF handler execution leading up to the reading of the CR2 (the faulting address). If we take a fault there then we destroy the CR2 value (with that of the prefetching instruction's) and possibly mishandle user-space or kernel-space pagefaults. It turns out that in current upstream we do exactly that: prefetchw(&mm->mmap_sem); /* Get the faulting address: */ address = read_cr2(); This is not good. So turn around the order: first read the cr2 then prefetch the lock address. Reading cr2 is plenty fast (2 cycles) so delaying the prefetch by this amount shouldnt be a big issue performance-wise. [ And this might explain a mystery fault.c warning that sometimes occurs on one an old AMD/Semptron based test-system i have - which does have such prefetch problems. ] Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Nick Piggin Cc: Pekka Enberg Cc: Vegard Nossum Cc: Jeremy Fitzhardinge Cc: Hugh Dickins LKML-Reference: <20090616030522.GA22162@Krystal> Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c6acc6326374..0482fa649738 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -951,11 +951,11 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) tsk = current; mm = tsk->mm; - prefetchw(&mm->mmap_sem); - /* Get the faulting address: */ address = read_cr2(); + prefetchw(&mm->mmap_sem); + if (unlikely(kmmio_fault(regs, address))) return; From 184e1fdfea066ab8f12a1e8912f402d2d6556d11 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 15 Jun 2009 15:37:07 +0800 Subject: [PATCH 11/45] x86, mce: fix a race condition about mce_callin and no_way_out If one CPU has no_way_out == 1, all other CPUs should have no_way_out == 1. But despite global_nwo is read after mce_callin, global_nwo is updated after mce_callin too. So it is possible that some CPU read global_nwo before some other CPU update global_nwo, so that no_way_out == 1 for some CPU, while no_way_out == 0 for some other CPU. This patch fixes this race condition via moving mce_callin updating after global_nwo updating, with a smp_wmb in between. A smp_rmb is added between their reading too. Signed-off-by: Huang Ying Acked-by: Andi Kleen Acked-by: Hidetoshi Seto --- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fabba15e4558..19294b8524cb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -703,6 +703,11 @@ static int mce_start(int no_way_out, int *order) } atomic_add(no_way_out, &global_nwo); + /* + * global_nwo should be updated before mce_callin + */ + smp_wmb(); + *order = atomic_add_return(1, &mce_callin); /* * Wait for everyone. @@ -716,6 +721,10 @@ static int mce_start(int no_way_out, int *order) ndelay(SPINUNIT); } + /* + * mce_callin should be read before global_nwo + */ + smp_rmb(); /* * Cache the global no_way_out state. */ @@ -862,7 +871,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Establish sequential order between the CPUs entering the machine * check handler. */ - int order; + int order = -1; /* * If no_way_out gets set, there is no safe way to recover from this @@ -887,7 +896,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (!banks) goto out; - order = atomic_add_return(1, &mce_callin); mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); From 33edbf02a92771fa2a81e41084a44ba874e3a5a5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:18:45 +0900 Subject: [PATCH 12/45] x86, mce: don't init timer if !mce_available In mce_cpu_restart, mce_init_timer is called unconditionally. If !mce_available (e.g. mce is disabled), there are no useful work for timer. Stop running it. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 19294b8524cb..dda77215e9e2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1617,8 +1617,9 @@ static int mce_resume(struct sys_device *dev) static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); - if (mce_available(¤t_cpu_data)) - mce_init(); + if (!mce_available(¤t_cpu_data)) + return; + mce_init(); mce_init_timer(); } From 7fb06fc9672b947424e05871243a4c8e19ec3bce Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 18:18:43 +0900 Subject: [PATCH 13/45] x86, mce: cleanup mce_start() Simplify interface of mce_start(): - no_way_out = mce_start(no_way_out, &order); + order = mce_start(&no_way_out); Now Monarch and Subjects share same exit(return) in usual path. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 70 +++++++++++++++----------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index dda77215e9e2..739fd7eca0a4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -691,23 +691,21 @@ static atomic_t global_nwo; * in the entry order. * TBD double check parallel CPU hotunplug */ -static int mce_start(int no_way_out, int *order) +static int mce_start(int *no_way_out) { - int nwo; + int order; int cpus = num_online_cpus(); u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; - if (!timeout) { - *order = -1; - return no_way_out; - } + if (!timeout) + return -1; - atomic_add(no_way_out, &global_nwo); + atomic_add(*no_way_out, &global_nwo); /* * global_nwo should be updated before mce_callin */ smp_wmb(); - *order = atomic_add_return(1, &mce_callin); + order = atomic_add_return(1, &mce_callin); /* * Wait for everyone. @@ -715,8 +713,7 @@ static int mce_start(int no_way_out, int *order) while (atomic_read(&mce_callin) != cpus) { if (mce_timed_out(&timeout)) { atomic_set(&global_nwo, 0); - *order = -1; - return no_way_out; + return -1; } ndelay(SPINUNIT); } @@ -725,34 +722,34 @@ static int mce_start(int no_way_out, int *order) * mce_callin should be read before global_nwo */ smp_rmb(); + + if (order == 1) { + /* + * Monarch: Starts executing now, the others wait. + */ + atomic_set(&mce_executing, 1); + } else { + /* + * Subject: Now start the scanning loop one by one in + * the original callin order. + * This way when there are any shared banks it will be + * only seen by one CPU before cleared, avoiding duplicates. + */ + while (atomic_read(&mce_executing) < order) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } + } + /* * Cache the global no_way_out state. */ - nwo = atomic_read(&global_nwo); + *no_way_out = atomic_read(&global_nwo); - /* - * Monarch starts executing now, the others wait. - */ - if (*order == 1) { - atomic_set(&mce_executing, 1); - return nwo; - } - - /* - * Now start the scanning loop one by one - * in the original callin order. - * This way when there are any shared banks it will - * be only seen by one CPU before cleared, avoiding duplicates. - */ - while (atomic_read(&mce_executing) < *order) { - if (mce_timed_out(&timeout)) { - atomic_set(&global_nwo, 0); - *order = -1; - return no_way_out; - } - ndelay(SPINUNIT); - } - return nwo; + return order; } /* @@ -871,8 +868,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Establish sequential order between the CPUs entering the machine * check handler. */ - int order = -1; - + int order; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If tolerant is cranked up, we'll try anyway. @@ -917,7 +913,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * This way we don't report duplicated events on shared banks * because the first one to see it will clear it. */ - no_way_out = mce_start(no_way_out, &order); + order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); if (!bank[i]) From 4e5b3e690dda890523e93af9c545261f5916a3a6 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:20:20 +0900 Subject: [PATCH 14/45] x86, mce: add __read_mostly Add __read_mostly to data written during setup. Suggested-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 739fd7eca0a4..2d2e0b565a9c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -57,7 +57,7 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; -int mce_disabled; +int mce_disabled __read_mostly; #ifdef CONFIG_X86_NEW_MCE @@ -76,19 +76,19 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors * 3: never panic or SIGBUS, log all errors (for testing only) */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static int monarch_timeout = -1; -static int mce_panic_timeout; -static int mce_dont_log_ce; -int mce_cmci_disabled; -int mce_ignore_ce; -int mce_ser; +static int tolerant __read_mostly = 1; +static int banks __read_mostly; +static u64 *bank __read_mostly; +static int rip_msr __read_mostly; +static int mce_bootlog __read_mostly = -1; +static int monarch_timeout __read_mostly = -1; +static int mce_panic_timeout __read_mostly; +static int mce_dont_log_ce __read_mostly; +int mce_cmci_disabled __read_mostly; +int mce_ignore_ce __read_mostly; +int mce_ser __read_mostly; +static unsigned long notify_user; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; From 1020bcbcc7da36001d9226c5d57e999949cb80c5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:20:57 +0900 Subject: [PATCH 15/45] x86, mce: rename static variables around trigger "trigger" is not straight forward name for valiable that holds name of user mode helper program which triggered by machine check events. This patch renames this valiable and kins to more recognizable names. trigger => mce_helper trigger_argv => mce_helper_argv notify_user => mce_need_notify No functional changes. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2d2e0b565a9c..e8b893e880ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -88,9 +88,10 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; -static unsigned long notify_user; -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; +/* User mode helper program triggered by machine check event */ +static unsigned long mce_need_notify; +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; static unsigned long dont_init_banks; @@ -180,7 +181,7 @@ void mce_log(struct mce *mce) wmb(); mce->finished = 1; - set_bit(0, ¬ify_user); + set_bit(0, &mce_need_notify); } static void print_mce(struct mce *m) @@ -1122,7 +1123,7 @@ static void mcheck_timer(unsigned long data) static void mce_do_trigger(struct work_struct *work) { - call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); } static DECLARE_WORK(mce_trigger_work, mce_do_trigger); @@ -1139,7 +1140,7 @@ int mce_notify_irq(void) clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, ¬ify_user)) { + if (test_and_clear_bit(0, &mce_need_notify)) { wake_up_interruptible(&mce_wait); /* @@ -1147,7 +1148,7 @@ int mce_notify_irq(void) * work_pending is always cleared before the function is * executed. */ - if (trigger[0] && !work_pending(&mce_trigger_work)) + if (mce_helper[0] && !work_pending(&mce_trigger_work)) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) @@ -1664,9 +1665,9 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - strcpy(buf, trigger); + strcpy(buf, mce_helper); strcat(buf, "\n"); - return strlen(trigger) + 1; + return strlen(mce_helper) + 1; } static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, @@ -1675,10 +1676,10 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *p; int len; - strncpy(trigger, buf, sizeof(trigger)); - trigger[sizeof(trigger)-1] = 0; - len = strlen(trigger); - p = strchr(trigger, '\n'); + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + len = strlen(mce_helper); + p = strchr(mce_helper, '\n'); if (*p) *p = 0; From 9af43b54ab4509f1dac49637d6917d57292e6518 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:21:36 +0900 Subject: [PATCH 16/45] x86, mce: sysfs entries for new mce options Add sysfs interface for admins who want to tweak these options without rebooting the system. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 84 +++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e8b893e880ed..4b62443b6e72 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1626,6 +1626,26 @@ static void mce_restart(void) on_each_cpu(mce_cpu_restart, NULL, 1); } +/* Toggle features for corrected errors */ +static void mce_disable_ce(void *all) +{ + if (!mce_available(¤t_cpu_data)) + return; + if (all) + del_timer_sync(&__get_cpu_var(mce_timer)); + cmci_clear(); +} + +static void mce_enable_ce(void *all) +{ + if (!mce_available(¤t_cpu_data)) + return; + cmci_reenable(); + cmci_recheck(); + if (all) + mce_init_timer(); +} + static struct sysdev_class mce_sysclass = { .suspend = mce_suspend, .shutdown = mce_shutdown, @@ -1687,6 +1707,52 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return len; } +static ssize_t set_ignore_ce(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + if (mce_ignore_ce ^ !!new) { + if (new) { + /* disable ce features */ + on_each_cpu(mce_disable_ce, (void *)1, 1); + mce_ignore_ce = 1; + } else { + /* enable ce features */ + mce_ignore_ce = 0; + on_each_cpu(mce_enable_ce, (void *)1, 1); + } + } + return size; +} + +static ssize_t set_cmci_disabled(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + if (mce_cmci_disabled ^ !!new) { + if (new) { + /* disable cmci */ + on_each_cpu(mce_disable_ce, NULL, 1); + mce_cmci_disabled = 1; + } else { + /* enable cmci */ + mce_cmci_disabled = 0; + on_each_cpu(mce_enable_ce, NULL, 1); + } + } + return size; +} + static ssize_t store_int_with_restart(struct sys_device *s, struct sysdev_attribute *attr, const char *buf, size_t size) @@ -1699,6 +1765,7 @@ static ssize_t store_int_with_restart(struct sys_device *s, static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); static struct sysdev_ext_attribute attr_check_interval = { _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, @@ -1706,9 +1773,24 @@ static struct sysdev_ext_attribute attr_check_interval = { &check_interval }; +static struct sysdev_ext_attribute attr_ignore_ce = { + _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), + &mce_ignore_ce +}; + +static struct sysdev_ext_attribute attr_cmci_disabled = { + _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_cmci_disabled), + &mce_cmci_disabled +}; + static struct sysdev_attribute *mce_attrs[] = { - &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, + &attr_tolerant.attr, + &attr_check_interval.attr, + &attr_trigger, &attr_monarch_timeout.attr, + &attr_dont_log_ce.attr, + &attr_ignore_ce.attr, + &attr_cmci_disabled.attr, NULL }; From 9e55e44e39798541ba39d57f4b569deb555ae1ce Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:22:15 +0900 Subject: [PATCH 17/45] x86, mce: unify mce.h There are 2 headers: arch/x86/include/asm/mce.h arch/x86/kernel/cpu/mcheck/mce.h and in the latter small header: #include This patch move all contents in the latter header into the former, and fix all files using the latter to include the former instead. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 36 +++++++++++++++++++-- arch/x86/kernel/cpu/mcheck/k7.c | 3 +- arch/x86/kernel/cpu/mcheck/mce.c | 1 - arch/x86/kernel/cpu/mcheck/mce.h | 38 ----------------------- arch/x86/kernel/cpu/mcheck/mce_intel.c | 3 +- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 -- arch/x86/kernel/cpu/mcheck/non-fatal.c | 3 +- arch/x86/kernel/cpu/mcheck/p4.c | 3 +- arch/x86/kernel/cpu/mcheck/p5.c | 3 +- arch/x86/kernel/cpu/mcheck/p6.c | 3 +- arch/x86/kernel/cpu/mcheck/winchip.c | 3 +- arch/x86/kernel/traps.c | 3 +- 12 files changed, 42 insertions(+), 59 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/mce.h diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 540a466e50f5..aae6fe2112f9 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -102,10 +102,42 @@ struct mce_log { #ifdef __KERNEL__ +#include +#include +#include + extern int mce_disabled; -#include -#include +#ifdef CONFIG_X86_OLD_MCE +void amd_mcheck_init(struct cpuinfo_x86 *c); +void intel_p4_mcheck_init(struct cpuinfo_x86 *c); +void intel_p6_mcheck_init(struct cpuinfo_x86 *c); +#endif + +#ifdef CONFIG_X86_ANCIENT_MCE +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); +extern int mce_p5_enable; +static inline int mce_p5_enabled(void) { return mce_p5_enable; } +static inline void enable_p5_mce(void) { mce_p5_enable = 1; } +#else +static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static inline int mce_p5_enabled(void) { return 0; } +static inline void enable_p5_mce(void) { } +#endif + +/* Call the installed machine check handler for this CPU setup. */ +extern void (*machine_check_vector)(struct pt_regs *, long error_code); + +#ifdef CONFIG_X86_OLD_MCE +extern int nr_mce_banks; +extern void intel_set_thermal_handler(void); +#else +static inline void intel_set_thermal_handler(void) { } +#endif + +void intel_init_thermal(struct cpuinfo_x86 *c); void mce_setup(struct mce *m); void mce_log(struct mce *m); diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index 89e510424152..b945d5dbc609 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -10,10 +10,9 @@ #include #include +#include #include -#include "mce.h" - /* Machine Check Handler For AMD Athlon/Duron: */ static void k7_machine_check(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4b62443b6e72..faedd776847d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -44,7 +44,6 @@ #include #include "mce-internal.h" -#include "mce.h" /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h deleted file mode 100644 index 84a552b458c8..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include - -#ifdef CONFIG_X86_OLD_MCE -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -#endif - -#ifdef CONFIG_X86_ANCIENT_MCE -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); -extern int mce_p5_enable; -static inline int mce_p5_enabled(void) { return mce_p5_enable; } -static inline void enable_p5_mce(void) { mce_p5_enable = 1; } -#else -static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} -static inline int mce_p5_enabled(void) { return 0; } -static inline void enable_p5_mce(void) { } -#endif - -/* Call the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); - -#ifdef CONFIG_X86_OLD_MCE - -extern int nr_mce_banks; - -void intel_set_thermal_handler(void); - -#else - -static inline void intel_set_thermal_handler(void) { } - -#endif - -void intel_init_thermal(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2b011d2d8579..475478b20884 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -11,10 +11,9 @@ #include #include #include +#include #include -#include "mce.h" - void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index f2ef6952c400..c548111d011b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -16,8 +16,6 @@ #include #include -#include "mce.h" - asmlinkage void smp_thermal_interrupt(void) { __u64 msr_val; diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index 70b710420f74..f5f2d6f71fb6 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -17,10 +17,9 @@ #include #include +#include #include -#include "mce.h" - static int firstbank; #define MCE_RATE (15*HZ) /* timer rate is 15s */ diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 82cee108a2d3..8d3e40edd64d 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -12,10 +12,9 @@ #include #include #include +#include #include -#include "mce.h" - /* as supported by the P4/Xeon family */ struct intel_mce_extended_msrs { u32 eax; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 015f481ab1b0..747853f9188d 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -10,10 +10,9 @@ #include #include +#include #include -#include "mce.h" - /* By default disabled */ int mce_p5_enable; diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 43c24e667457..01e4f8178183 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -10,10 +10,9 @@ #include #include +#include #include -#include "mce.h" - /* Machine Check Handler For PII/PIII */ static void intel_machine_check(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 81b02487090b..54060f565974 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -9,10 +9,9 @@ #include #include +#include #include -#include "mce.h" - /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1e1e27b7d438..71f9c74814d8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -53,6 +53,7 @@ #include #include #include +#include #include @@ -64,8 +65,6 @@ #include #include -#include "cpu/mcheck/mce.h" - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ From c697836985e18d9c34897428ba563b13044a6dcd Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:22:49 +0900 Subject: [PATCH 18/45] x86, mce: make mce_disabled boolean The mce_disabled on 32bit is a tristate variable [1,0,-1], while 64bit version is boolean [0,1]. This patch makes mce_disabled always boolean, and use mce_p5_enabled to indicate the third state instead. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 8 +++----- arch/x86/kernel/cpu/mcheck/mce.c | 8 +++----- arch/x86/kernel/cpu/mcheck/p5.c | 12 +++++------- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index aae6fe2112f9..6568cdedcd89 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -107,6 +107,7 @@ struct mce_log { #include extern int mce_disabled; +extern int mce_p5_enabled; #ifdef CONFIG_X86_OLD_MCE void amd_mcheck_init(struct cpuinfo_x86 *c); @@ -117,14 +118,11 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c); #ifdef CONFIG_X86_ANCIENT_MCE void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); -extern int mce_p5_enable; -static inline int mce_p5_enabled(void) { return mce_p5_enable; } -static inline void enable_p5_mce(void) { mce_p5_enable = 1; } +static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } #else static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} -static inline int mce_p5_enabled(void) { return 0; } -static inline void enable_p5_mce(void) { } +static inline void enable_p5_mce(void) {} #endif /* Call the installed machine check handler for this CPU setup. */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index faedd776847d..6095e0296abd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1286,8 +1286,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) return; switch (c->x86_vendor) { case X86_VENDOR_INTEL: - if (mce_p5_enabled()) - intel_p5_mcheck_init(c); + intel_p5_mcheck_init(c); break; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); @@ -2002,7 +2001,7 @@ EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) { - if (mce_disabled == 1) + if (mce_disabled) return; switch (c->x86_vendor) { @@ -2032,10 +2031,9 @@ void mcheck_init(struct cpuinfo_x86 *c) static int __init mcheck_enable(char *str) { - mce_disabled = -1; + mce_p5_enabled = 1; return 1; } - __setup("mce", mcheck_enable); #endif /* CONFIG_X86_OLD_MCE */ diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 747853f9188d..5c0e6533d9bc 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -14,7 +14,7 @@ #include /* By default disabled */ -int mce_p5_enable; +int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) @@ -42,16 +42,14 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; + /* Default P5 to off as its often misconnected: */ + if (!mce_p5_enabled) + return; + /* Check for MCE support: */ if (!cpu_has(c, X86_FEATURE_MCE)) return; -#ifdef CONFIG_X86_OLD_MCE - /* Default P5 to off as its often misconnected: */ - if (mce_disabled != -1) - return; -#endif - machine_check_vector = pentium_machine_check; /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); From 3adacb70d32046ccc9f0333b50bb2ba1582ccdf4 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:23:28 +0900 Subject: [PATCH 19/45] x86, mce: unify smp_thermal_interrupt, prepare p4 Remove unused argument regs from handlers, and use inc_irq_stat. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p4.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 8d3e40edd64d..ddeed6e295af 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -35,7 +35,7 @@ static int mce_num_extended_msrs; #ifdef CONFIG_X86_MCE_P4THERMAL -static void unexpected_thermal_interrupt(struct pt_regs *regs) +static void unexpected_thermal_interrupt(void) { printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", smp_processor_id()); @@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) } /* P4/Xeon Thermal transition interrupt handler: */ -static void intel_thermal_interrupt(struct pt_regs *regs) +static void intel_thermal_interrupt(void) { __u64 msr_val; @@ -54,14 +54,13 @@ static void intel_thermal_interrupt(struct pt_regs *regs) } /* Thermal interrupt handler for this CPU setup: */ -static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = - unexpected_thermal_interrupt; +static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; void smp_thermal_interrupt(struct pt_regs *regs) { irq_enter(); - vendor_thermal_interrupt(regs); - __get_cpu_var(irq_stat).irq_thermal_count++; + vendor_thermal_interrupt(); + inc_irq_stat(irq_thermal_count); irq_exit(); } From 5335612a574a45beab14193ec641ed2f45e7a523 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:24:09 +0900 Subject: [PATCH 20/45] x86, mce: unify smp_thermal_interrupt, prepare mce_intel_64 Break smp_thermal_interrupt() into two functions. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index c548111d011b..a5232b2c4ca0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -16,19 +16,21 @@ #include #include -asmlinkage void smp_thermal_interrupt(void) +static void intel_thermal_interrupt(void) { __u64 msr_val; - ack_APIC_irq(); - - exit_idle(); - irq_enter(); - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) mce_log_therm_throt_event(msr_val); +} +asmlinkage void smp_thermal_interrupt(void) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + intel_thermal_interrupt(); inc_irq_stat(irq_thermal_count); irq_exit(); } From e8ce2c5ee826b3787202493effcd08d4b1e1e639 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:24:40 +0900 Subject: [PATCH 21/45] x86, mce: unify smp_thermal_interrupt, prepare Let them in same shape. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 25 +++++++++++++---------- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 20 ++++++++++++++++-- arch/x86/kernel/cpu/mcheck/p4.c | 10 +++++---- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6568cdedcd89..3bc827c0f409 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -110,6 +110,7 @@ extern int mce_disabled; extern int mce_p5_enabled; #ifdef CONFIG_X86_OLD_MCE +extern int nr_mce_banks; void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); void intel_p6_mcheck_init(struct cpuinfo_x86 *c); @@ -128,15 +129,6 @@ static inline void enable_p5_mce(void) {} /* Call the installed machine check handler for this CPU setup. */ extern void (*machine_check_vector)(struct pt_regs *, long error_code); -#ifdef CONFIG_X86_OLD_MCE -extern int nr_mce_banks; -extern void intel_set_thermal_handler(void); -#else -static inline void intel_set_thermal_handler(void) { } -#endif - -void intel_init_thermal(struct cpuinfo_x86 *c); - void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); @@ -175,8 +167,6 @@ int mce_available(struct cpuinfo_x86 *c); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); -void mce_log_therm_throt_event(__u64 status); - extern atomic_t mce_entry; void do_machine_check(struct pt_regs *, long); @@ -205,5 +195,18 @@ void mcheck_init(struct cpuinfo_x86 *c); extern void (*mce_threshold_vector)(void); +/* + * Thermal handler + */ + +void intel_set_thermal_handler(void); +void intel_init_thermal(struct cpuinfo_x86 *c); + +#ifdef CONFIG_X86_NEW_MCE +void mce_log_therm_throt_event(__u64 status); +#else +static inline void mce_log_therm_throt_event(__u64 status) {} +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index a5232b2c4ca0..922e3a482f6f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -16,6 +16,14 @@ #include #include +static void unexpected_thermal_interrupt(void) +{ + printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +/* P4/Xeon Thermal transition interrupt handler: */ static void intel_thermal_interrupt(void) { __u64 msr_val; @@ -25,14 +33,22 @@ static void intel_thermal_interrupt(void) mce_log_therm_throt_event(msr_val); } +/* Thermal interrupt handler for this CPU setup: */ +static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; + asmlinkage void smp_thermal_interrupt(void) { - ack_APIC_irq(); exit_idle(); irq_enter(); - intel_thermal_interrupt(); inc_irq_stat(irq_thermal_count); + intel_thermal_interrupt(); irq_exit(); + ack_APIC_irq(); +} + +void intel_set_thermal_handler(void) +{ + vendor_thermal_interrupt = intel_thermal_interrupt; } /* diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index ddeed6e295af..75313f5b624e 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -47,10 +48,9 @@ static void intel_thermal_interrupt(void) { __u64 msr_val; - ack_APIC_irq(); - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - therm_throt_process(msr_val & THERM_STATUS_PROCHOT); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + mce_log_therm_throt_event(msr_val); } /* Thermal interrupt handler for this CPU setup: */ @@ -58,10 +58,12 @@ static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; void smp_thermal_interrupt(struct pt_regs *regs) { + exit_idle(); irq_enter(); - vendor_thermal_interrupt(); inc_irq_stat(irq_thermal_count); + vendor_thermal_interrupt(); irq_exit(); + ack_APIC_irq(); } void intel_set_thermal_handler(void) From a65c88dd2c83b569dbd13778da689861bdf977f2 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:25:27 +0900 Subject: [PATCH 22/45] x86, mce: unify smp_thermal_interrupt Put common functions into therm_throt.c, modify Makefile. unexpected_thermal_interrupt intel_thermal_interrupt smp_thermal_interrupt intel_set_thermal_handler Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 7 ++-- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 38 ------------------- arch/x86/kernel/cpu/mcheck/p4.c | 45 ----------------------- arch/x86/kernel/cpu/mcheck/therm_throt.c | 40 +++++++++++++++++++- 4 files changed, 43 insertions(+), 87 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 45004faf67ea..53df57d11c50 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,12 @@ -obj-y = mce.o therm_throt.o +obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o -obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o + +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 922e3a482f6f..3b7a0572e2fe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -9,48 +9,10 @@ #include #include #include -#include #include #include -#include -#include #include -static void unexpected_thermal_interrupt(void) -{ - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", - smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler: */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) - mce_log_therm_throt_event(msr_val); -} - -/* Thermal interrupt handler for this CPU setup: */ -static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; - -asmlinkage void smp_thermal_interrupt(void) -{ - exit_idle(); - irq_enter(); - inc_irq_stat(irq_thermal_count); - intel_thermal_interrupt(); - irq_exit(); - ack_APIC_irq(); -} - -void intel_set_thermal_handler(void) -{ - vendor_thermal_interrupt = intel_thermal_interrupt; -} - /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 75313f5b624e..76c5f0305f9f 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -1,8 +1,6 @@ /* * P4 specific Machine Check Exception Reporting */ - -#include #include #include #include @@ -10,9 +8,6 @@ #include #include -#include -#include -#include #include #include @@ -33,46 +28,6 @@ struct intel_mce_extended_msrs { static int mce_num_extended_msrs; - -#ifdef CONFIG_X86_MCE_P4THERMAL - -static void unexpected_thermal_interrupt(void) -{ - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", - smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler: */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) - mce_log_therm_throt_event(msr_val); -} - -/* Thermal interrupt handler for this CPU setup: */ -static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; - -void smp_thermal_interrupt(struct pt_regs *regs) -{ - exit_idle(); - irq_enter(); - inc_irq_stat(irq_thermal_count); - vendor_thermal_interrupt(); - irq_exit(); - ack_APIC_irq(); -} - -void intel_set_thermal_handler(void) -{ - vendor_thermal_interrupt = intel_thermal_interrupt; -} - -#endif /* CONFIG_X86_MCE_P4THERMAL */ - /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7b1ae2e20ba5..b3792b196856 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -13,6 +13,7 @@ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. * Inspired by Ross Biro's and Al Borchers' counter code. */ +#include #include #include #include @@ -20,6 +21,8 @@ #include #include +#include +#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -186,6 +189,41 @@ static __init int thermal_throttle_init_device(void) return 0; } - device_initcall(thermal_throttle_init_device); + #endif /* CONFIG_SYSFS */ + +/* Thermal transition interrupt handler */ +void intel_thermal_interrupt(void) +{ + __u64 msr_val; + + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + mce_log_therm_throt_event(msr_val); +} + +static void unexpected_thermal_interrupt(void) +{ + printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; + +asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +{ + exit_idle(); + irq_enter(); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); +} + +void intel_set_thermal_handler(void) +{ + smp_thermal_vector = intel_thermal_interrupt; +} From 895287c0a6aa571160c47ee10de11b542166c4f9 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:26:10 +0900 Subject: [PATCH 23/45] x86, mce: squash mce_intel.c into therm_throt.c move intel_init_thermal() into therm_throt.c Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel.c | 73 ------------------------ arch/x86/kernel/cpu/mcheck/therm_throt.c | 66 +++++++++++++++++++++ 3 files changed, 67 insertions(+), 74 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel.c diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 53df57d11c50..659564e5fc0f 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -9,4 +9,4 @@ obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o -obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o mce_intel.o +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c deleted file mode 100644 index 475478b20884..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Common code for Intel machine checks - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - - intel_set_thermal_handler(); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index b3792b196856..7a508aaafcea 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -16,13 +16,21 @@ #include #include #include +#include #include #include +#include +#include +#include #include #include +#include +#include +#include #include #include +#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -227,3 +235,61 @@ void intel_set_thermal_handler(void) { smp_thermal_vector = intel_thermal_interrupt; } + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; + + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return; + + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) + tm2 = 1; + + /* Check whether a vector already exists */ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; + } + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + + intel_set_thermal_handler(); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + + /* Unmask the thermal vector: */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); +} From 8363fc82d36c0886292e33925391dca93f03bd50 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:26:36 +0900 Subject: [PATCH 24/45] x86, mce: remove intel_set_thermal_handler() and make intel_thermal_interrupt() static. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 1 - arch/x86/kernel/cpu/mcheck/therm_throt.c | 9 ++------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3bc827c0f409..365a594b41bc 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -199,7 +199,6 @@ extern void (*mce_threshold_vector)(void); * Thermal handler */ -void intel_set_thermal_handler(void); void intel_init_thermal(struct cpuinfo_x86 *c); #ifdef CONFIG_X86_NEW_MCE diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7a508aaafcea..7c7944cc515d 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -202,7 +202,7 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ /* Thermal transition interrupt handler */ -void intel_thermal_interrupt(void) +static void intel_thermal_interrupt(void) { __u64 msr_val; @@ -231,11 +231,6 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } -void intel_set_thermal_handler(void) -{ - smp_thermal_vector = intel_thermal_interrupt; -} - void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -278,7 +273,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_THERM_INTERRUPT, l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - intel_set_thermal_handler(); + smp_thermal_vector = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); From 1149e7264528723b8c3b075a90386ceb2e07070a Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:27:13 +0900 Subject: [PATCH 25/45] x86, mce: remove therm_throt.h Now all symbols in the header are static. Remove the header. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/therm_throt.h | 9 --------- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1 - arch/x86/kernel/cpu/mcheck/p4.c | 1 - arch/x86/kernel/cpu/mcheck/therm_throt.c | 5 ++--- 4 files changed, 2 insertions(+), 14 deletions(-) delete mode 100644 arch/x86/include/asm/therm_throt.h diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h deleted file mode 100644 index c62349ee7860..000000000000 --- a/arch/x86/include/asm/therm_throt.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _ASM_X86_THERM_THROT_H -#define _ASM_X86_THERM_THROT_H - -#include - -extern atomic_t therm_throt_en; -int therm_throt_process(int curr); - -#endif /* _ASM_X86_THERM_THROT_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 3b7a0572e2fe..663a88e8b3c7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -11,7 +11,6 @@ #include #include #include -#include /* * Support for Intel Correct Machine Check Interrupts. This allows diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 76c5f0305f9f..4482aea9aa2e 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7c7944cc515d..bff8dd191dd5 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include @@ -38,7 +37,7 @@ static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -atomic_t therm_throt_en = ATOMIC_INIT(0); +static atomic_t therm_throt_en = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ @@ -93,7 +92,7 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -int therm_throt_process(int curr) +static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); From 58995d2d58e8e555bc92582abe7554921deea3aa Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:27:47 +0900 Subject: [PATCH 26/45] x86, mce: mce.h cleanup Reorder definitions. - static inline dummy mcheck_init() for !CONFIG_X86_MCE - gather defs for exception, threshold handler Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 365a594b41bc..5cdd8d100ec9 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -109,6 +109,12 @@ struct mce_log { extern int mce_disabled; extern int mce_p5_enabled; +#ifdef CONFIG_X86_MCE +void mcheck_init(struct cpuinfo_x86 *c); +#else +static inline void mcheck_init(struct cpuinfo_x86 *c) {} +#endif + #ifdef CONFIG_X86_OLD_MCE extern int nr_mce_banks; void amd_mcheck_init(struct cpuinfo_x86 *c); @@ -126,13 +132,9 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} static inline void enable_p5_mce(void) {} #endif -/* Call the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); - void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); -extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); /* * To support more than 128 would need to escape the predefined @@ -169,8 +171,6 @@ DECLARE_PER_CPU(unsigned, mce_poll_count); extern atomic_t mce_entry; -void do_machine_check(struct pt_regs *, long); - typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); @@ -187,13 +187,20 @@ void mce_notify_process(void); DECLARE_PER_CPU(struct mce, injectm); extern struct file_operations mce_chrdev_ops; -#ifdef CONFIG_X86_MCE -void mcheck_init(struct cpuinfo_x86 *c); -#else -#define mcheck_init(c) do { } while (0) -#endif +/* + * Exception handler + */ + +/* Call the installed machine check handler for this CPU setup. */ +extern void (*machine_check_vector)(struct pt_regs *, long error_code); +void do_machine_check(struct pt_regs *, long); + +/* + * Threshold handler + */ extern void (*mce_threshold_vector)(void); +extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); /* * Thermal handler From 1af0815f9639752409a9c15ba6a3dc0f322fd114 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:28:38 +0900 Subject: [PATCH 27/45] x86, mce: rename _64.c files which are no longer 64-bit-specific Rename files that are no longer 64bit specific: mce_amd_64.c => mce_amd.c mce_intel_64.c => mce_intel.c Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 4 ++-- arch/x86/kernel/cpu/mcheck/{mce_amd_64.c => mce_amd.c} | 0 arch/x86/kernel/cpu/mcheck/{mce_intel_64.c => mce_intel.c} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename arch/x86/kernel/cpu/mcheck/{mce_amd_64.c => mce_amd.c} (100%) rename arch/x86/kernel/cpu/mcheck/{mce_intel_64.c => mce_intel.c} (100%) diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 659564e5fc0f..188a1ca5ad2b 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -3,8 +3,8 @@ obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o -obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o +obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c similarity index 100% rename from arch/x86/kernel/cpu/mcheck/mce_amd_64.c rename to arch/x86/kernel/cpu/mcheck/mce_amd.c diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c similarity index 100% rename from arch/x86/kernel/cpu/mcheck/mce_intel_64.c rename to arch/x86/kernel/cpu/mcheck/mce_intel.c From 95ee14e4379c5e19c0897c872350570402014742 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 9 Jun 2009 18:20:39 -0700 Subject: [PATCH 28/45] x86: cap iomem_resource to addressable physical memory iomem_resource is by default initialized to -1, which means 64 bits of physical address space if 64-bit resources are enabled. However, x86 CPUs cannot address 64 bits of physical address space. Thus, we want to cap the physical address space to what the union of all CPU can actually address. Without this patch, we may end up assigning inaccessible values to uninitialized 64-bit PCI memory resources. Signed-off-by: H. Peter Anvin Cc: Matthew Wilcox Cc: Jesse Barnes Cc: Martin Mares Cc: stable@kernel.org --- arch/x86/kernel/cpu/common.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3ffdcfa9abdf..5b9cb8839cae 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -853,6 +853,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) numa_add_cpu(smp_processor_id()); #endif + + /* Cap the iomem address space to what is addressable on all CPUs */ + iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1; } #ifdef CONFIG_X86_64 From e2a7147640a54eb812c8ab5f3ee4424b92db4856 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Tue, 16 Jun 2009 16:43:40 -0500 Subject: [PATCH 29/45] x86: correct the conversion of EFI memory types This patch causes all the EFI_RESERVED_TYPE memory reservations to be recorded in the e820 table as type E820_RESERVED. (This patch replaces one called 'x86: vendor reserved memory type'. This version has been discussed a bit with Peter and Yinghai but not given a final opinion.) Without this patch EFI_RESERVED_TYPE memory reservations may be marked usable in the e820 table. There may be a collision between kernel use and some reserver's use of this memory. (An example use of this functionality is the UV system, which will access extremely large areas of memory with a memory engine that allows a user to address beyond the processor's range. Such areas are reserved in the EFI table by the BIOS. Some loaders have a restricted number of entries possible in the e820 table, hence the need to record the reservations in the unrestricted EFI table.) The call to do_add_efi_memmap() is only made if "add_efi_memmap" is specified on the kernel command line. Signed-off-by: Cliff Wickman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/efi.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 1736acc4d7aa..96f7ac0bbf01 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -240,10 +240,35 @@ static void __init do_add_efi_memmap(void) unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; int e820_type; - if (md->attribute & EFI_MEMORY_WB) - e820_type = E820_RAM; - else + switch (md->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + if (md->attribute & EFI_MEMORY_WB) + e820_type = E820_RAM; + else + e820_type = E820_RESERVED; + break; + case EFI_ACPI_RECLAIM_MEMORY: + e820_type = E820_ACPI; + break; + case EFI_ACPI_MEMORY_NVS: + e820_type = E820_NVS; + break; + case EFI_UNUSABLE_MEMORY: + e820_type = E820_UNUSABLE; + break; + default: + /* + * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE + * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO + * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE + */ e820_type = E820_RESERVED; + break; + } e820_add_region(start, size, e820_type); } sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); From 28b4868820a56de661f54742ff91b78e12f1e582 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 16 Jun 2009 17:39:04 -0700 Subject: [PATCH 30/45] x86, boot: use .code16gcc instead of .code16 Use .code16gcc to compile arch/x86/boot/bioscall.S rather than .code16, since some older versions of binutils can't generate 32-bit addressing expressions (67 prefixes) in .code16 mode, only in .code16gcc mode. Reported-by: Tetsuo Handa Signed-off-by: H. Peter Anvin --- arch/x86/boot/bioscall.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index 507793739ea5..1dfbf64e52a2 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S @@ -13,7 +13,7 @@ * touching registers they shouldn't be. */ - .code16 + .code16gcc .text .globl intcall .type intcall, @function From 203abd67b75f7714ce98ab0cdbd6cfd7ad79dec4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Jun 2009 14:52:01 +0200 Subject: [PATCH 31/45] x86: mce: Handle banks == 0 case in K7 quirk Vegard Nossum reported: > I get an MCE-related crash like this in latest linus tree: > > [ 0.115341] CPU: L1 I Cache: 64K (64 bytes/line), D cache 64K (64 bytes/line) > [ 0.116396] CPU: L2 Cache: 512K (64 bytes/line) > [ 0.120570] mce: CPU supports 0 MCE banks > [ 0.124870] BUG: unable to handle kernel NULL pointer dereference at 00000000 00000010 > [ 0.128001] IP: [] mcheck_init+0x278/0x320 > [ 0.128001] PGD 0 > [ 0.128001] Thread overran stack, or stack corrupted > [ 0.128001] Oops: 0002 [#1] PREEMPT SMP > [ 0.128001] last sysfs file: > [ 0.128001] CPU 0 > [ 0.128001] Modules linked in: > [ 0.128001] Pid: 0, comm: swapper Not tainted 2.6.30 #426 > [ 0.128001] RIP: 0010:[] [] mcheck_init+0x278/0x320 > [ 0.128001] RSP: 0018:ffffffff81595e38 EFLAGS: 00000246 > [ 0.128001] RAX: 0000000000000010 RBX: ffffffff8158f900 RCX: 0000000000000000 > [ 0.128001] RDX: 0000000000000000 RSI: 00000000000000ff RDI: 0000000000000010 > [ 0.128001] RBP: ffffffff81595e68 R08: 0000000000000001 R09: 0000000000000000 > [ 0.128001] R10: 0000000000000010 R11: 0000000000000000 R12: 0000000000000000 > [ 0.128001] R13: 00000000ffffffff R14: 0000000000000000 R15: 0000000000000000 > [ 0.128001] FS: 0000000000000000(0000) GS:ffff880002288000(0000) knlGS:00000 > 00000000000 > [ 0.128001] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b > [ 0.128001] CR2: 0000000000000010 CR3: 0000000001001000 CR4: 00000000000006b0 > [ 0.128001] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [ 0.128001] DR3: 0000000000000000 DR6: 0000000000000000 DR7: 0000000000000000 > [ 0.128001] Process swapper (pid: 0, threadinfo ffffffff81594000, task ffffff > ff8152a4a0) > [ 0.128001] Stack: > [ 0.128001] 0000000081595e68 5aa50ed3b4ddbe6e ffffffff8158f900 ffffffff8158f > 914 > [ 0.128001] ffffffff8158f948 0000000000000000 ffffffff81595eb8 ffffffff813b8 > 69c > [ 0.128001] 5aa50ed3b4ddbe6e 00000001078bfbfd 0000062300000800 5aa50ed3b4ddb > e6e > [ 0.128001] Call Trace: > [ 0.128001] [] identify_cpu+0x331/0x392 > [ 0.128001] [] identify_boot_cpu+0x23/0x6e > [ 0.128001] [] check_bugs+0x1c/0x60 > [ 0.128001] [] start_kernel+0x403/0x46e > [ 0.128001] [] x86_64_start_reservations+0xac/0xd5 > [ 0.128001] [] x86_64_start_kernel+0x115/0x14b > [ 0.128001] [] ? early_idt_handler+0x0/0x71 This happens on QEMU which reports MCA capability, but no banks. Without this patch there is a buffer overrun and boot ops because the code would try to initialize the 0 element of a zero length kmalloc() buffer. Reported-by: Vegard Nossum Tested-by: Pekka Enberg Signed-off-by: Andi Kleen LKML-Reference: <20090615125200.GD31969@one.firstfloor.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fabba15e4558..d9d77cfd8cce 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1245,7 +1245,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6) + if (c->x86 == 6 && banks > 0) bank[0] = 0; } From 5ce4243dcefbbc43791ffc36e1be55067ceec916 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 15 Jun 2009 22:26:33 +0400 Subject: [PATCH 32/45] x86: mce: Don't touch THERMAL_APIC_VECTOR if no active APIC present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If APIC was disabled (for some reason) and as result it's not even mapped we should not try to enable thermal interrupts at all. Reported-by: Simon Holm Thøgersen Tested-by: Simon Holm Thøgersen Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090615182633.GA7606@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2b011d2d8579..61e32881f41b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -21,9 +21,15 @@ void intel_init_thermal(struct cpuinfo_x86 *c) int tm2 = 0; u32 l, h; - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + /* + * Thermal monitoring depends on ACPI, clock modulation + * and APIC as well + */ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC) || + !cpu_has(c, X86_FEATURE_APIC)) { + pr_debug("Thermal monitoring disabled\n"); return; + } /* * First check if its enabled already, in which case there might From 50a8d4d29735ec99139e58ea3cb11bed3331cf87 Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Wed, 17 Jun 2009 22:25:20 +0800 Subject: [PATCH 33/45] x86, io_apic.c: Work around compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This compiler warning: arch/x86/kernel/apic/io_apic.c: In function ‘ioapic_write_entry’: arch/x86/kernel/apic/io_apic.c:466: warning: ‘eu’ is used uninitialized in this function arch/x86/kernel/apic/io_apic.c:465: note: ‘eu’ was declared here Is bogus as 'eu' is always initialized. But annotate it away by initializing the variable, to make it easier for people to notice real warnings. A compiler that sees through this logic will optimize away the initialization. Signed-off-by: Figo.zhang LKML-Reference: <1245248720.3312.27.camel@myhost> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ef8d9290c7ea..95e03701106a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -462,7 +462,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - union entry_union eu; + union entry_union eu = {{0, 0}}; + eu.entry = e; io_apic_write(apic, 0x11 + 2*pin, eu.w2); io_apic_write(apic, 0x10 + 2*pin, eu.w1); From 8f7007aabee22d6c186070e9532b0965f6036bb7 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 10 Jun 2009 12:41:01 -0700 Subject: [PATCH 34/45] x86: apic/io_apic.c: dmar_msi_type should be static Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 95e03701106a..29d0752d9517 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3568,7 +3568,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) #endif /* CONFIG_SMP */ -struct irq_chip dmar_msi_type = { +static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", .unmask = dmar_msi_unmask, .mask = dmar_msi_mask, From 1bf7b31efa0c322d93cb3f772cd9bc743e8bb42d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 17 Jun 2009 08:31:15 -0700 Subject: [PATCH 35/45] x86, mce: mce_intel.c needs mce_intel.c uses apic_write() and lapic_get_maxlvt(), and so it needs . Signed-off-by: H. Peter Anvin Cc: Andi Kleen Cc: Hidetoshi Seto --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 663a88e8b3c7..e1acec0f7a32 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include From fe955e5c793aab398794be4c5ede172d48446c4a Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 10 Jun 2009 12:41:02 -0700 Subject: [PATCH 36/45] x86: nmi: Add Intel processor 0x6f4 to NMI perfctr1 workaround Expand Intel NMI perfctr1 workaround to include a Core2 processor stepping (cpuid family-6, model-f, stepping-4). Resolves a situation where the NMI would not enable on these processors. Signed-off-by: Prarit Bhargava Acked-by: Suresh Siddha Cc: prarit@redhat.com Cc: suresh.b.siddha@intel.com Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d6f5b9fbde32..5c481f6205bf 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void) wd_ops = &k7_wd_ops; break; case X86_VENDOR_INTEL: - /* - * Work around Core Duo (Yonah) errata AE49 where perfctr1 - * doesn't have a working enable bit. + /* Work around where perfctr1 doesn't have a working enable + * bit as described in the following errata: + * AE49 Core Duo and Intel Core Solo 65 nm + * AN49 Intel Pentium Dual-Core + * AF49 Dual-Core Intel Xeon Processor LV */ - if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || + ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && + boot_cpu_data.x86_mask == 4))) { intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; } From 8fa62ad9d24e24707164ac7a97cbe59fe78a8591 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 17 Jun 2009 14:11:10 +0530 Subject: [PATCH 37/45] x86: msr.h linux/types.h is only required for __KERNEL__ is only required for __KERNEL__ as whole file is covered with it Also fixed some spacing issues for usr/include/asm-x86/msr.h Signed-off-by: Jaswinder Singh Rajput Cc: "H. Peter Anvin" LKML-Reference: <1245228070.2662.1.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 22603764e7db..48ad9d29484a 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -3,13 +3,10 @@ #include -#ifndef __ASSEMBLY__ -# include -#endif - #ifdef __KERNEL__ #ifndef __ASSEMBLY__ +#include #include #include #include @@ -264,6 +261,4 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) #endif /* CONFIG_SMP */ #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ - - #endif /* _ASM_X86_MSR_H */ From 8653f88ff93ba6bd6df4bac48908d41ad09dba7e Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sat, 13 Jun 2009 20:21:26 +0800 Subject: [PATCH 38/45] x86: Remove duplicated #include's Signed-off-by: Huang Weiyi LKML-Reference: <1244895686-2348-1-git-send-email-weiyi.huang@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/probe_32.c | 11 ----------- arch/x86/kernel/apic/summit_32.c | 1 - 2 files changed, 12 deletions(-) diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 440a8bccd91a..0c0182cc947d 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -20,23 +20,12 @@ #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include #include -#include -#include #include #include #include -#include #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 344eee4ac0a4..eafdfbd1ea95 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include From 3f4c3955ea320bde870ac2ce587466295aba5710 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 17 Jun 2009 22:13:22 +0400 Subject: [PATCH 39/45] x86, ioapic: Don't call disconnect_bsp_APIC if no APIC present Vegard Nossum reported: [ 503.576724] ACPI: Preparing to enter system sleep state S5 [ 503.710857] Disabling non-boot CPUs ... [ 503.716853] Power down. [ 503.717770] ------------[ cut here ]------------ [ 503.717770] WARNING: at arch/x86/kernel/apic/apic.c:249 native_apic_write_du) [ 503.717770] Hardware name: OptiPlex GX100 [ 503.717770] Modules linked in: [ 503.717770] Pid: 2136, comm: halt Not tainted 2.6.30 #443 [ 503.717770] Call Trace: [ 503.717770] [] ? printk+0x18/0x1a [ 503.717770] [] ? native_apic_write_dummy+0x38/0x50 [ 503.717770] [] warn_slowpath_common+0x6c/0xc0 [ 503.717770] [] ? native_apic_write_dummy+0x38/0x50 [ 503.717770] [] warn_slowpath_null+0x15/0x20 [ 503.717770] [] native_apic_write_dummy+0x38/0x50 [ 503.717770] [] disconnect_bsp_APIC+0x63/0x100 [ 503.717770] [] disable_IO_APIC+0xb8/0xc0 [ 503.717770] [] ? acpi_power_off+0x0/0x29 [ 503.717770] [] native_machine_shutdown+0x65/0x80 [ 503.717770] [] native_machine_power_off+0x26/0x30 [ 503.717770] [] machine_power_off+0x9/0x10 [ 503.717770] [] kernel_power_off+0x36/0x40 [ 503.717770] [] sys_reboot+0xfd/0x1f0 [ 503.717770] [] ? perf_swcounter_event+0xb0/0x130 [ 503.717770] [] ? perf_counter_task_sched_out+0x5d/0x120 [ 503.717770] [] ? finish_task_switch+0x56/0xd0 [ 503.717770] [] ? schedule+0x49e/0xb40 [ 503.717770] [] ? sys_kill+0x70/0x160 [ 503.717770] [] ? selinux_file_ioctl+0x3b/0x50 [ 503.717770] [] ? sys_ioctl+0x63/0x70 [ 503.717770] [] sysenter_do_call+0x12/0x22 [ 503.717770] ---[ end trace 8157b5d0ed378f15 ]--- | | That's including this commit: | | commit 103428e57be323c3c5545db8ad12667099bc6005 |Author: Cyrill Gorcunov |Date: Sun Jun 7 16:48:40 2009 +0400 | | x86, apic: Fix dummy apic read operation together with broken MP handling | If we have apic disabled we don't even switch to APIC mode and do not calling for connect_bsp_APIC. Though on SMP compiled kernel the native_machine_shutdown does try to write the apic register anyway. Fix it with explicit check if we really should touch apic registers. Reported-by: Vegard Nossum Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090617181322.GG10822@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 29d0752d9517..b7a79207295e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2004,7 +2004,9 @@ void disable_IO_APIC(void) /* * Use virtual wire A mode when interrupt remapping is enabled. */ - disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); + if (cpu_has_apic) + disconnect_bsp_APIC(!intr_remapping_enabled && + ioapic_i8259.pin != -1); } #ifdef CONFIG_X86_32 From 2e04bc76560decd9270be2a805927316f617ef56 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 18 Jun 2009 00:35:57 +0200 Subject: [PATCH 40/45] i386: fix return to 16-bit stack from NMI handler Returning to a task with a 16-bit stack requires special care: the iret instruction does not restore the high word of esp in that case. The espfix code fixes this, but currently is not invoked on NMIs. This means that a running task gets the upper word of esp clobbered due intervening NMIs. To reproduce, compile and run the following program with the nmi watchdog enabled (nmi_watchdog=2 on the command line). Using gdb you can see that the high bits of esp contain garbage, while the low bits are still correct. This patch puts the espfix code back into the NMI code path. The patch is slightly complicated due to the irqtrace infrastructure not being NMI-safe. The NMI return path cannot call TRACE_IRQS_IRET. Otherwise, the tail of the normal iret-code is correct for the nmi code path too. To be able to share this code-path, the TRACE_IRQS_IRET was move up a bit. The espfix code exists after the TRACE_IRQS_IRET, but this code explicitly disables interrupts. This short interrupts-off section is now not traced anymore. The return-to-kernel path now always includes the preliminary test to decide if the espfix code should be called. This is never the case, but doing it this way keeps the patch as simple as possible and the few extra instructions should not affect timing in any significant way. #define _GNU_SOURCE #include #include #include #include #include #include int modify_ldt(int func, void *ptr, unsigned long bytecount) { return syscall(SYS_modify_ldt, func, ptr, bytecount); } /* this is assumed to be usable */ #define SEGBASEADDR 0x10000 #define SEGLIMIT 0x20000 /* 16-bit segment */ struct user_desc desc = { .entry_number = 0, .base_addr = SEGBASEADDR, .limit = SEGLIMIT, .seg_32bit = 0, .contents = 0, /* ??? */ .read_exec_only = 0, .limit_in_pages = 0, .seg_not_present = 0, .useable = 1 }; int main(void) { setvbuf(stdout, NULL, _IONBF, 0); /* map a 64 kb segment */ char *pointer = mmap((void *)SEGBASEADDR, SEGLIMIT+1, PROT_EXEC|PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); if (pointer == NULL) { printf("could not map space\n"); return 0; } /* write ldt, new mode */ int err = modify_ldt(0x11, &desc, sizeof(desc)); if (err) { printf("error modifying ldt: %i\n", err); return 0; } for (int i=0; i<1000; i++) { asm volatile ( "pusha\n\t" "mov %ss, %eax\n\t" /* preserve ss:esp */ "mov %esp, %ebp\n\t" "push $7\n\t" /* index 0, ldt, user mode */ "push $65536-4096\n\t" /* esp */ "lss (%esp), %esp\n\t" /* switch to new stack */ "push %eax\n\t" /* save old ss:esp on new stack */ "push %ebp\n\t" "add $17*65536, %esp\n\t" /* set high bits */ "mov %esp, %edx\n\t" "mov $10000000, %ecx\n\t" /* wait... */ "1: loop 1b\n\t" /* ... a bit */ "cmp %esp, %edx\n\t" "je 1f\n\t" "ud2\n\t" /* esp changed inexplicably! */ "1:\n\t" "sub $17*65536, %esp\n\t" /* restore high bits */ "lss (%esp), %esp\n\t" /* restore old ss:esp */ "popa\n\t"); printf("\rx%ix", i); } return 0; } Signed-off-by: Alexander van Heukelum Acked-by: Stas Sergeev Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c929add475c9..d7d1c7d20e4e 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -84,7 +84,7 @@ #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else #define preempt_stop(clobbers) -#define resume_kernel restore_nocheck +#define resume_kernel restore_all #endif .macro TRACE_IRQS_IRET @@ -372,7 +372,7 @@ END(ret_from_exception) ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_nocheck + jnz restore_all need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl @@ -540,6 +540,8 @@ syscall_exit: jne syscall_exit_work restore_all: + TRACE_IRQS_IRET +restore_all_notrace: movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. @@ -551,8 +553,6 @@ restore_all: CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: - TRACE_IRQS_IRET -restore_nocheck_notrace: RESTORE_REGS 4 # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 irq_return: @@ -601,8 +601,10 @@ ldt_ss: CFI_ADJUST_CFA_OFFSET 4 pushl %eax CFI_ADJUST_CFA_OFFSET 4 + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ DISABLE_INTERRUPTS(CLBR_EAX) - TRACE_IRQS_OFF lss (%esp), %esp CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck @@ -1329,7 +1331,7 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_nocheck_notrace + jmp restore_all_notrace CFI_ENDPROC nmi_stack_fixup: From dc4c2a0aed3b09f6e255bd5c3faa50fe6e0b2ded Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 18 Jun 2009 00:35:58 +0200 Subject: [PATCH 41/45] i386: fix/simplify espfix stack switching, move it into assembly The espfix code triggers if we have a protected mode userspace application with a 16-bit stack. On returning to userspace, with iret, the CPU doesn't restore the high word of the stack pointer. This is an "official" bug, and the work-around used in the kernel is to temporarily switch to a 32-bit stack segment/pointer pair where the high word of the pointer is equal to the high word of the userspace stackpointer. The current implementation uses THREAD_SIZE to determine the cut-off, but there is no good reason not to use the more natural 64kb... However, implementing this by simply substituting THREAD_SIZE with 65536 in patch_espfix_desc crashed the test application. patch_espfix_desc tries to do what is described above, but gets it subtly wrong if the userspace stack pointer is just below a multiple of THREAD_SIZE: an overflow occurs to bit 13... With a bit of luck, when the kernelspace stackpointer is just below a 64kb-boundary, the overflow then ripples trough to bit 16 and userspace will see its stack pointer changed by 65536. This patch moves all espfix code into entry_32.S. Selecting a 16-bit cut-off simplifies the code. The game with changing the limit dynamically is removed too. It complicates matters and I see no value in it. Changing only the top 16-bit word of ESP is one instruction and it also implies that only two bytes of the ESPFIX GDT entry need to be changed and this can be implemented in just a handful simple to understand instructions. As a side effect, the operation to compute the original ESP from the ESPFIX ESP and the GDT entry simplifies a bit too, and the remaining three instructions have been expanded inline in entry_32.S. impact: can now reliably run userspace with ESP=xxxxfffc on 16-bit stack segment Signed-off-by: Alexander van Heukelum Acked-by: Stas Sergeev Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/entry_32.S | 49 +++++++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5b9cb8839cae..ba1131ac9434 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -108,7 +108,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { /* data */ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, GDT_STACK_CANARY_INIT #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index d7d1c7d20e4e..848f73f09549 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -588,24 +588,34 @@ ldt_ss: jne restore_nocheck #endif - /* If returning to userspace with 16bit stack, - * try to fix the higher word of ESP, as the CPU - * won't restore it. - * This is an "official" bug of all the x86-compatible - * CPUs, which we can try to work around to make - * dosemu and wine happy. */ - movl PT_OLDESP(%esp), %eax - movl %esp, %edx - call patch_espfix_desc +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + PER_CPU(gdt_page, %ebx) + shr $16, %edx + mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ + mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 - pushl %eax + push %eax /* new kernel esp */ CFI_ADJUST_CFA_OFFSET 4 /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ DISABLE_INTERRUPTS(CLBR_EAX) - lss (%esp), %esp + lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck CFI_ENDPROC @@ -718,15 +728,24 @@ PTREGSCALL(vm86) PTREGSCALL(vm86old) .macro FIXUP_ESPFIX_STACK - /* since we are on a wrong stack, we cant make it a C code :( */ +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ + /* fixup the stack */ PER_CPU(gdt_page, %ebx) - GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) - addl %esp, %eax + mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ + mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS CFI_ADJUST_CFA_OFFSET 4 pushl %eax CFI_ADJUST_CFA_OFFSET 4 - lss (%esp), %esp + lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 .endm .macro UNWIND_ESPFIX_STACK From bc3f5d3dbd576da94a575b1477b8e38551bf11da Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 18 Jun 2009 00:35:59 +0200 Subject: [PATCH 42/45] x86: de-assembler-ize asm/desc.h asm/desc.h is included in three assembly files, but the only macro it defines, GET_DESC_BASE, is never used. This patch removes the includes, removes the macro GET_DESC_BASE and the ASSEMBLY guard from asm/desc.h. Signed-off-by: Alexander van Heukelum Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/desc.h | 26 -------------------------- arch/x86/kernel/entry_32.S | 1 - arch/x86/kernel/head_32.S | 1 - arch/x86/kernel/head_64.S | 1 - 4 files changed, 29 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index c45f415ce315..c993e9e0fed4 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_DESC_H #define _ASM_X86_DESC_H -#ifndef __ASSEMBLY__ #include #include #include @@ -380,29 +379,4 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); } -#else -/* - * GET_DESC_BASE reads the descriptor base of the specified segment. - * - * Args: - * idx - descriptor index - * gdt - GDT pointer - * base - 32bit register to which the base will be written - * lo_w - lo word of the "base" register - * lo_b - lo byte of the "base" register - * hi_b - hi byte of the low word of the "base" register - * - * Example: - * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) - * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. - */ -#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ - movb idx * 8 + 4(gdt), lo_b; \ - movb idx * 8 + 7(gdt), hi_b; \ - shll $16, base; \ - movw idx * 8 + 2(gdt), lo_w; - - -#endif /* __ASSEMBLY__ */ - #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 848f73f09549..9f8ce77dbc64 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -48,7 +48,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index dc5ed4bdd88d..8663afb56535 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 54b29bb24e71..fa54f78e2a05 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include From 74b602c7147212a7495879ec23fe6c2d3b470e06 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 17 Jun 2009 14:43:32 -0700 Subject: [PATCH 43/45] x86: fix duplicated sysfs attribute The sysfs attribute cmci_disabled was accidentall turned into a duplicate of ignore_ce, breaking all other attributes. Signed-off-by: Yinghai Lu Acked-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2a560cefb675..5aac9e4dd136 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1777,7 +1777,7 @@ static struct sysdev_ext_attribute attr_ignore_ce = { }; static struct sysdev_ext_attribute attr_cmci_disabled = { - _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_cmci_disabled), + _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), &mce_cmci_disabled }; From e92fae064ae42b2a4a77646f7655bca4c87bb1eb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 17 Jun 2009 16:21:33 -0700 Subject: [PATCH 44/45] x86: use zalloc_cpumask_var for mce_dev_initialized We need a cleared cpu_mask to record if mce is initialized, especially when MAXSMP is used. used zalloc_... instead Signed-off-by: Yinghai Lu Reviewed-by: Hidetoshi Seto Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5aac9e4dd136..c2fb70d0286f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1969,7 +1969,7 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); err = mce_init_banks(); if (err) From b1f49f9582f9be6de5055cfa97eabf6246f2eaf7 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 18 Jun 2009 14:53:24 +0900 Subject: [PATCH 45/45] x86, mce: fix error path in mce_create_device() Don't skip removing mce_attrs in route from error2. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Cc: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c2fb70d0286f..284d1de968bc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1798,7 +1798,7 @@ static cpumask_var_t mce_dev_initialized; static __cpuinit int mce_create_device(unsigned int cpu) { int err; - int i; + int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; @@ -1816,9 +1816,9 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } - for (i = 0; i < banks; i++) { + for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[i]); + &bank_attrs[j]); if (err) goto error2; } @@ -1826,8 +1826,8 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: - while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + while (--j >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); error: while (--i >= 0) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);