linux-stable/arch/s390/kernel/nmi.c

516 lines
13 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Machine check handler
*
* Copyright IBM Corp. 2000, 2009
* Author(s): Ingo Adlung <adlung@de.ibm.com>,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
* Cornelia Huck <cornelia.huck@de.ibm.com>,
*/
#include <linux/kernel_stat.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/entry-common.h>
#include <linux/hardirq.h>
#include <linux/log2.h>
#include <linux/kprobes.h>
headers: untangle kmemleak.h from mm.h Currently <linux/slab.h> #includes <linux/kmemleak.h> for no obvious reason. It looks like it's only a convenience, so remove kmemleak.h from slab.h and add <linux/kmemleak.h> to any users of kmemleak_* that don't already #include it. Also remove <linux/kmemleak.h> from source files that do not use it. This is tested on i386 allmodconfig and x86_64 allmodconfig. It would be good to run it through the 0day bot for other $ARCHes. I have neither the horsepower nor the storage space for the other $ARCHes. Update: This patch has been extensively build-tested by both the 0day bot & kisskb/ozlabs build farms. Both of them reported 2 build failures for which patches are included here (in v2). [ slab.h is the second most used header file after module.h; kernel.h is right there with slab.h. There could be some minor error in the counting due to some #includes having comments after them and I didn't combine all of those. ] [akpm@linux-foundation.org: security/keys/big_key.c needs vmalloc.h, per sfr] Link: http://lkml.kernel.org/r/e4309f98-3749-93e1-4bb7-d9501a39d015@infradead.org Link: http://kisskb.ellerman.id.au/kisskb/head/13396/ Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Reviewed-by: Ingo Molnar <mingo@kernel.org> Reported-by: Michael Ellerman <mpe@ellerman.id.au> [2 build failures] Reported-by: Fengguang Wu <fengguang.wu@intel.com> [2 build failures] Reviewed-by: Andrew Morton <akpm@linux-foundation.org> Cc: Wei Yongjun <weiyongjun1@huawei.com> Cc: Luis R. Rodriguez <mcgrof@kernel.org> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Mimi Zohar <zohar@linux.vnet.ibm.com> Cc: John Johansen <john.johansen@canonical.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-04-05 23:25:34 +00:00
#include <linux/kmemleak.h>
#include <linux/time.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/kvm_host.h>
#include <linux/export.h>
#include <asm/lowcore.h>
#include <asm/ctlreg.h>
#include <asm/smp.h>
#include <asm/stp.h>
#include <asm/cputime.h>
#include <asm/nmi.h>
#include <asm/crw.h>
#include <asm/switch_to.h>
#include <asm/asm-offsets.h>
s390/pai: add support for cryptography counters PMU device driver perf_pai_crypto supports Processor Activity Instrumentation (PAI), available with IBM z16: - maps a full page to lowcore address 0x1500. - uses CR0 bit 13 to turn PAI crypto counting on and off. - creates a sample with raw data on each context switch out when at context switch some mapped counters have a value of nonzero. This device driver only supports CPU wide context, no task context is allowed. Support for counting: - one or more counters can be specified using perf stat -e pai_crypto/xxx/ where xxx stands for the counter event name. Multiple invocation of this command is possible. The counter names are listed in /sys/devices/pai_crypto/events directory. - one special counters can be specified using perf stat -e pai_crypto/CRYPTO_ALL/ which returns the sum of all incremented crypto counters. - one event pai_crypto/CRYPTO_ALL/ is reserved for sampling. No multiple invocations are possible. The event collects data at context switch out and saves them in the ring buffer. Add qpaci assembly instruction to query supported memory mapped crypto counters. It returns the number of counters (no holes allowed in that range). The PAI crypto counter events are system wide and can not be executed in parallel. Therefore some restrictions documented in function paicrypt_busy apply. In particular event CRYPTO_ALL for sampling must run exclusive. Only counting events can run in parallel. PAI crypto counter events can not be created when a CPU hot plug add is processed. This means a CPU hot plug add does not get the necessary PAI event to record PAI cryptography counter increments on the newly added CPU. CPU hot plug remove removes the event and terminates the counting of PAI counters immediately. Co-developed-by: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Sven Schnelle <svens@linux.ibm.com> Reviewed-by: Juergen Christ <jchrist@linux.ibm.com> Signed-off-by: Thomas Richter <tmricht@linux.ibm.com> Link: https://lore.kernel.org/r/20220504062351.2954280-3-tmricht@linux.ibm.com Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
2022-05-04 06:23:51 +00:00
#include <asm/pai.h>
#include <asm/vx-insn.h>
#include <asm/fpu/api.h>
struct mcck_struct {
unsigned int kill_task : 1;
unsigned int channel_report : 1;
unsigned int warning : 1;
unsigned int stp_queue : 1;
unsigned long mcck_code;
};
static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
static inline int nmi_needs_mcesa(void)
{
return cpu_has_vx() || MACHINE_HAS_GS;
}
/*
* The initial machine check extended save area for the boot CPU.
* It will be replaced on the boot CPU reinit with an allocated
* structure. The structure is required for machine check happening
* early in the boot process.
*/
static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE);
void __init nmi_alloc_mcesa_early(u64 *mcesad)
{
if (!nmi_needs_mcesa())
return;
*mcesad = __pa(&boot_mcesa);
if (MACHINE_HAS_GS)
*mcesad |= ilog2(MCESA_MAX_SIZE);
}
int nmi_alloc_mcesa(u64 *mcesad)
{
unsigned long size;
void *origin;
*mcesad = 0;
if (!nmi_needs_mcesa())
return 0;
size = MACHINE_HAS_GS ? MCESA_MAX_SIZE : MCESA_MIN_SIZE;
origin = kmalloc(size, GFP_KERNEL);
if (!origin)
return -ENOMEM;
/* The pointer is stored with mcesa_bits ORed in */
kmemleak_not_leak(origin);
*mcesad = __pa(origin);
if (MACHINE_HAS_GS)
*mcesad |= ilog2(MCESA_MAX_SIZE);
return 0;
}
void nmi_free_mcesa(u64 *mcesad)
{
if (!nmi_needs_mcesa())
return;
kfree(__va(*mcesad & MCESA_ORIGIN_MASK));
}
static __always_inline char *nmi_puts(char *dest, const char *src)
{
while (*src)
*dest++ = *src++;
*dest = 0;
return dest;
}
static __always_inline char *u64_to_hex(char *dest, u64 val)
{
int i, num;
for (i = 1; i <= 16; i++) {
num = (val >> (64 - 4 * i)) & 0xf;
if (num >= 10)
*dest++ = 'A' + num - 10;
else
*dest++ = '0' + num;
}
*dest = 0;
return dest;
}
static notrace void s390_handle_damage(void)
{
union ctlreg0 cr0, cr0_new;
char message[100];
psw_t psw_save;
char *ptr;
smp_emergency_stop();
diag_amode31_ops.diag308_reset();
ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x");
u64_to_hex(ptr, S390_lowcore.mcck_interruption_code);
/*
* Disable low address protection and make machine check new PSW a
* disabled wait PSW. Any additional machine check cannot be handled.
*/
local_ctl_store(0, &cr0.reg);
cr0_new = cr0;
cr0_new.lap = 0;
local_ctl_load(0, &cr0_new.reg);
psw_save = S390_lowcore.mcck_new_psw;
psw_bits(S390_lowcore.mcck_new_psw).io = 0;
psw_bits(S390_lowcore.mcck_new_psw).ext = 0;
psw_bits(S390_lowcore.mcck_new_psw).wait = 1;
sclp_emergency_printk(message);
/*
* Restore machine check new PSW and control register 0 to original
* values. This makes possible system dump analysis easier.
*/
S390_lowcore.mcck_new_psw = psw_save;
local_ctl_load(0, &cr0.reg);
disabled_wait();
while (1);
}
NOKPROBE_SYMBOL(s390_handle_damage);
/*
s390: convert to generic entry This patch converts s390 to use the generic entry infrastructure from kernel/entry/*. There are a few special things on s390: - PIF_PER_TRAP is moved to TIF_PER_TRAP as the generic code doesn't know about our PIF flags in exit_to_user_mode_loop(). - The old code had several ways to restart syscalls: a) PIF_SYSCALL_RESTART, which was only set during execve to force a restart after upgrading a process (usually qemu-kvm) to pgste page table extensions. b) PIF_SYSCALL, which is set by do_signal() to indicate that the current syscall should be restarted. This is changed so that do_signal() now also uses PIF_SYSCALL_RESTART. Continuing to use PIF_SYSCALL doesn't work with the generic code, and changing it to PIF_SYSCALL_RESTART makes PIF_SYSCALL and PIF_SYSCALL_RESTART more unique. - On s390 calling sys_sigreturn or sys_rt_sigreturn is implemented by executing a svc instruction on the process stack which causes a fault. While handling that fault the fault code sets PIF_SYSCALL to hand over processing to the syscall code on exit to usermode. The patch introduces PIF_SYSCALL_RET_SET, which is set if ptrace sets a return value for a syscall. The s390x ptrace ABI uses r2 both for the syscall number and return value, so ptrace cannot set the syscall number + return value at the same time. The flag makes handling that a bit easier. do_syscall() will just skip executing the syscall if PIF_SYSCALL_RET_SET is set. CONFIG_DEBUG_ASCE was removd in favour of the generic CONFIG_DEBUG_ENTRY. CR1/7/13 will be checked both on kernel entry and exit to contain the correct asces. Signed-off-by: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
2020-11-21 10:14:56 +00:00
* Main machine check handler function. Will be called with interrupts disabled
* and machine checks enabled.
*/
s390/mcck: cleanup user process termination path If a machine check interrupt hits while user process is running __s390_handle_mcck() helper function is called directly from the interrupt handler and terminates the current process by calling make_task_dead() routine. The make_task_dead() is not allowed to be called from interrupt context which forces the machine check handler switch to the kernel stack and enable local interrupts first. The __s390_handle_mcck() could also be called to service pending work, but this time from the external interrupts handler. It is the machine check handler that establishes the work and schedules the external interrupt, therefore the machine check interrupt itself should be disabled while reading out the corresponding variable: local_mcck_disable(); mcck = *this_cpu_ptr(&cpu_mcck); memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); local_mcck_enable(); However, local_mcck_disable() does not have effect when __s390_handle_mcck() is called directly form the machine check handler, since the machine check interrupt is still disabled. Therefore, it is not the opening bracket to the following local_mcck_enable() function. Simplify the user process termination flow by scheduling the external interrupt and killing the affected process from the interrupt context. Assume a kernel-generated signal is always delivered and ignore a value returned by do_send_sig_info() funciton. Reviewed-by: Heiko Carstens <hca@linux.ibm.com> Reviewed-by: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com> Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
2022-12-17 10:01:16 +00:00
void s390_handle_mcck(void)
{
struct mcck_struct mcck;
unsigned long mflags;
/*
* Disable machine checks and get the current state of accumulated
* machine checks. Afterwards delete the old state and enable machine
* checks again.
*/
local_mcck_save(mflags);
mcck = *this_cpu_ptr(&cpu_mcck);
memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck));
local_mcck_restore(mflags);
if (mcck.channel_report)
crw_handle_channel_report();
/*
* A warning may remain for a prolonged period on the bare iron.
* (actually until the machine is powered off, or the problem is gone)
* So we just stop listening for the WARNING MCH and avoid continuously
* being interrupted. One caveat is however, that we must do this per
* processor and cannot use the smp version of ctl_clear_bit().
* On VM we only get one interrupt per virtally presented machinecheck.
* Though one suffices, we may get one interrupt per (virtual) cpu.
*/
if (mcck.warning) { /* WARNING pending ? */
static int mchchk_wng_posted = 0;
/* Use single cpu clear, as we cannot handle smp here. */
local_ctl_clear_bit(14, CR14_WARNING_SUBMASK_BIT);
if (xchg(&mchchk_wng_posted, 1) == 0)
kill_cad_pid(SIGPWR, 1);
}
if (mcck.stp_queue)
stp_queue_work();
if (mcck.kill_task) {
printk(KERN_EMERG "mcck: Terminating task because of machine "
"malfunction (code 0x%016lx).\n", mcck.mcck_code);
printk(KERN_EMERG "mcck: task: %s, pid: %d.\n",
current->comm, current->pid);
s390/mcck: cleanup user process termination path If a machine check interrupt hits while user process is running __s390_handle_mcck() helper function is called directly from the interrupt handler and terminates the current process by calling make_task_dead() routine. The make_task_dead() is not allowed to be called from interrupt context which forces the machine check handler switch to the kernel stack and enable local interrupts first. The __s390_handle_mcck() could also be called to service pending work, but this time from the external interrupts handler. It is the machine check handler that establishes the work and schedules the external interrupt, therefore the machine check interrupt itself should be disabled while reading out the corresponding variable: local_mcck_disable(); mcck = *this_cpu_ptr(&cpu_mcck); memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); local_mcck_enable(); However, local_mcck_disable() does not have effect when __s390_handle_mcck() is called directly form the machine check handler, since the machine check interrupt is still disabled. Therefore, it is not the opening bracket to the following local_mcck_enable() function. Simplify the user process termination flow by scheduling the external interrupt and killing the affected process from the interrupt context. Assume a kernel-generated signal is always delivered and ignore a value returned by do_send_sig_info() funciton. Reviewed-by: Heiko Carstens <hca@linux.ibm.com> Reviewed-by: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com> Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
2022-12-17 10:01:16 +00:00
if (is_global_init(current))
panic("mcck: Attempting to kill init!\n");
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, current, PIDTYPE_PID);
}
}
/*
* returns 0 if register contents could be validated
* returns 1 otherwise
*/
static int notrace s390_validate_registers(union mci mci)
{
struct mcesa *mcesa;
void *fpt_save_area;
union ctlreg2 cr2;
int kill_task;
u64 zero;
kill_task = 0;
zero = 0;
if (!mci.gr || !mci.fp)
kill_task = 1;
fpt_save_area = &S390_lowcore.floating_pt_save_area;
if (!mci.fc) {
kill_task = 1;
asm volatile(
" lfpc %0\n"
:
: "Q" (zero));
} else {
asm volatile(
" lfpc %0\n"
:
: "Q" (S390_lowcore.fpt_creg_save_area));
}
mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
if (!cpu_has_vx()) {
/* Validate floating point registers */
asm volatile(
" ld 0,0(%0)\n"
" ld 1,8(%0)\n"
" ld 2,16(%0)\n"
" ld 3,24(%0)\n"
" ld 4,32(%0)\n"
" ld 5,40(%0)\n"
" ld 6,48(%0)\n"
" ld 7,56(%0)\n"
" ld 8,64(%0)\n"
" ld 9,72(%0)\n"
" ld 10,80(%0)\n"
" ld 11,88(%0)\n"
" ld 12,96(%0)\n"
" ld 13,104(%0)\n"
" ld 14,112(%0)\n"
" ld 15,120(%0)\n"
:
: "a" (fpt_save_area)
: "memory");
} else {
/* Validate vector registers */
union ctlreg0 cr0;
/*
* The vector validity must only be checked if not running a
* KVM guest. For KVM guests the machine check is forwarded by
* KVM and it is the responsibility of the guest to take
* appropriate actions. The host vector or FPU values have been
* saved by KVM and will be restored by KVM.
*/
if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST))
kill_task = 1;
cr0.reg = S390_lowcore.cregs_save_area[0];
cr0.afp = cr0.vx = 1;
local_ctl_load(0, &cr0.reg);
asm volatile(
" la 1,%0\n"
" VLM 0,15,0,1\n"
" VLM 16,31,256,1\n"
:
: "Q" (*(struct vx_array *)mcesa->vector_save_area)
: "1");
local_ctl_load(0, &S390_lowcore.cregs_save_area[0]);
}
/* Validate access registers */
asm volatile(
" lam 0,15,0(%0)\n"
:
: "a" (&S390_lowcore.access_regs_save_area)
: "memory");
if (!mci.ar)
kill_task = 1;
/* Validate guarded storage registers */
cr2.reg = S390_lowcore.cregs_save_area[2];
if (cr2.gse) {
if (!mci.gs) {
s390: add a system call for guarded storage This adds a new system call to enable the use of guarded storage for user space processes. The system call takes two arguments, a command and pointer to a guarded storage control block: s390_guarded_storage(int command, struct gs_cb *gs_cb); The second argument is relevant only for the GS_SET_BC_CB command. The commands in detail: 0 - GS_ENABLE Enable the guarded storage facility for the current task. The initial content of the guarded storage control block will be all zeros. After the enablement the user space code can use load-guarded-storage-controls instruction (LGSC) to load an arbitrary control block. While a task is enabled the kernel will save and restore the current content of the guarded storage registers on context switch. 1 - GS_DISABLE Disables the use of the guarded storage facility for the current task. The kernel will cease to save and restore the content of the guarded storage registers, the task specific content of these registers is lost. 2 - GS_SET_BC_CB Set a broadcast guarded storage control block. This is called per thread and stores a specific guarded storage control block in the task struct of the current task. This control block will be used for the broadcast event GS_BROADCAST. 3 - GS_CLEAR_BC_CB Clears the broadcast guarded storage control block. The guarded- storage control block is removed from the task struct that was established by GS_SET_BC_CB. 4 - GS_BROADCAST Sends a broadcast to all thread siblings of the current task. Every sibling that has established a broadcast guarded storage control block will load this control block and will be enabled for guarded storage. The broadcast guarded storage control block is used up, a second broadcast without a refresh of the stored control block with GS_SET_BC_CB will not have any effect. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
2016-01-26 13:10:34 +00:00
/*
* 2 cases:
* - machine check in kernel or userspace
* - machine check while running SIE (KVM guest)
* For kernel or userspace the userspace values of
* guarded storage control can not be recreated, the
* process must be terminated.
* For SIE the guest values of guarded storage can not
* be recreated. This is either due to a bug or due to
* GS being disabled in the guest. The guest will be
* notified by KVM code and the guests machine check
* handling must take care of this. The host values
* are saved by KVM and are not affected.
s390: add a system call for guarded storage This adds a new system call to enable the use of guarded storage for user space processes. The system call takes two arguments, a command and pointer to a guarded storage control block: s390_guarded_storage(int command, struct gs_cb *gs_cb); The second argument is relevant only for the GS_SET_BC_CB command. The commands in detail: 0 - GS_ENABLE Enable the guarded storage facility for the current task. The initial content of the guarded storage control block will be all zeros. After the enablement the user space code can use load-guarded-storage-controls instruction (LGSC) to load an arbitrary control block. While a task is enabled the kernel will save and restore the current content of the guarded storage registers on context switch. 1 - GS_DISABLE Disables the use of the guarded storage facility for the current task. The kernel will cease to save and restore the content of the guarded storage registers, the task specific content of these registers is lost. 2 - GS_SET_BC_CB Set a broadcast guarded storage control block. This is called per thread and stores a specific guarded storage control block in the task struct of the current task. This control block will be used for the broadcast event GS_BROADCAST. 3 - GS_CLEAR_BC_CB Clears the broadcast guarded storage control block. The guarded- storage control block is removed from the task struct that was established by GS_SET_BC_CB. 4 - GS_BROADCAST Sends a broadcast to all thread siblings of the current task. Every sibling that has established a broadcast guarded storage control block will load this control block and will be enabled for guarded storage. The broadcast guarded storage control block is used up, a second broadcast without a refresh of the stored control block with GS_SET_BC_CB will not have any effect. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
2016-01-26 13:10:34 +00:00
*/
if (!test_cpu_flag(CIF_MCCK_GUEST))
kill_task = 1;
} else {
load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area);
}
s390: add a system call for guarded storage This adds a new system call to enable the use of guarded storage for user space processes. The system call takes two arguments, a command and pointer to a guarded storage control block: s390_guarded_storage(int command, struct gs_cb *gs_cb); The second argument is relevant only for the GS_SET_BC_CB command. The commands in detail: 0 - GS_ENABLE Enable the guarded storage facility for the current task. The initial content of the guarded storage control block will be all zeros. After the enablement the user space code can use load-guarded-storage-controls instruction (LGSC) to load an arbitrary control block. While a task is enabled the kernel will save and restore the current content of the guarded storage registers on context switch. 1 - GS_DISABLE Disables the use of the guarded storage facility for the current task. The kernel will cease to save and restore the content of the guarded storage registers, the task specific content of these registers is lost. 2 - GS_SET_BC_CB Set a broadcast guarded storage control block. This is called per thread and stores a specific guarded storage control block in the task struct of the current task. This control block will be used for the broadcast event GS_BROADCAST. 3 - GS_CLEAR_BC_CB Clears the broadcast guarded storage control block. The guarded- storage control block is removed from the task struct that was established by GS_SET_BC_CB. 4 - GS_BROADCAST Sends a broadcast to all thread siblings of the current task. Every sibling that has established a broadcast guarded storage control block will load this control block and will be enabled for guarded storage. The broadcast guarded storage control block is used up, a second broadcast without a refresh of the stored control block with GS_SET_BC_CB will not have any effect. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
2016-01-26 13:10:34 +00:00
}
/*
* The getcpu vdso syscall reads CPU number from the programmable
* field of the TOD clock. Disregard the TOD programmable register
* validity bit and load the CPU number into the TOD programmable
* field unconditionally.
*/
set_tod_programmable_field(raw_smp_processor_id());
/* Validate clock comparator register */
set_clock_comparator(S390_lowcore.clock_comparator);
if (!mci.ms || !mci.pm || !mci.ia)
kill_task = 1;
return kill_task;
}
NOKPROBE_SYMBOL(s390_validate_registers);
/*
* Backup the guest's machine check info to its description block
*/
static void notrace s390_backup_mcck_info(struct pt_regs *regs)
{
struct mcck_volatile_info *mcck_backup;
struct sie_page *sie_page;
/* r14 contains the sie block, which was set in sie64a */
struct kvm_s390_sie_block *sie_block = phys_to_virt(regs->gprs[14]);
if (sie_block == NULL)
/* Something's seriously wrong, stop system. */
s390_handle_damage();
sie_page = container_of(sie_block, struct sie_page, sie_block);
mcck_backup = &sie_page->mcck_info;
mcck_backup->mcic = S390_lowcore.mcck_interruption_code &
~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE);
mcck_backup->ext_damage_code = S390_lowcore.external_damage_code;
mcck_backup->failing_storage_address
= S390_lowcore.failing_storage_address;
}
NOKPROBE_SYMBOL(s390_backup_mcck_info);
#define MAX_IPD_COUNT 29
#define MAX_IPD_TIME (5 * 60 * USEC_PER_SEC) /* 5 minutes */
#define ED_STP_ISLAND 6 /* External damage STP island check */
#define ED_STP_SYNC 7 /* External damage STP sync check */
#define MCCK_CODE_NO_GUEST (MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE)
/*
* machine check handler.
*/
s390/mcck: cleanup user process termination path If a machine check interrupt hits while user process is running __s390_handle_mcck() helper function is called directly from the interrupt handler and terminates the current process by calling make_task_dead() routine. The make_task_dead() is not allowed to be called from interrupt context which forces the machine check handler switch to the kernel stack and enable local interrupts first. The __s390_handle_mcck() could also be called to service pending work, but this time from the external interrupts handler. It is the machine check handler that establishes the work and schedules the external interrupt, therefore the machine check interrupt itself should be disabled while reading out the corresponding variable: local_mcck_disable(); mcck = *this_cpu_ptr(&cpu_mcck); memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); local_mcck_enable(); However, local_mcck_disable() does not have effect when __s390_handle_mcck() is called directly form the machine check handler, since the machine check interrupt is still disabled. Therefore, it is not the opening bracket to the following local_mcck_enable() function. Simplify the user process termination flow by scheduling the external interrupt and killing the affected process from the interrupt context. Assume a kernel-generated signal is always delivered and ignore a value returned by do_send_sig_info() funciton. Reviewed-by: Heiko Carstens <hca@linux.ibm.com> Reviewed-by: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com> Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
2022-12-17 10:01:16 +00:00
void notrace s390_do_machine_check(struct pt_regs *regs)
{
static int ipd_count;
static DEFINE_SPINLOCK(ipd_lock);
static unsigned long long last_ipd;
struct mcck_struct *mcck;
unsigned long long tmp;
irqentry_state_t irq_state;
union mci mci;
unsigned long mcck_dam_code;
int mcck_pending = 0;
irq_state = irqentry_nmi_enter(regs);
s390: convert to generic entry This patch converts s390 to use the generic entry infrastructure from kernel/entry/*. There are a few special things on s390: - PIF_PER_TRAP is moved to TIF_PER_TRAP as the generic code doesn't know about our PIF flags in exit_to_user_mode_loop(). - The old code had several ways to restart syscalls: a) PIF_SYSCALL_RESTART, which was only set during execve to force a restart after upgrading a process (usually qemu-kvm) to pgste page table extensions. b) PIF_SYSCALL, which is set by do_signal() to indicate that the current syscall should be restarted. This is changed so that do_signal() now also uses PIF_SYSCALL_RESTART. Continuing to use PIF_SYSCALL doesn't work with the generic code, and changing it to PIF_SYSCALL_RESTART makes PIF_SYSCALL and PIF_SYSCALL_RESTART more unique. - On s390 calling sys_sigreturn or sys_rt_sigreturn is implemented by executing a svc instruction on the process stack which causes a fault. While handling that fault the fault code sets PIF_SYSCALL to hand over processing to the syscall code on exit to usermode. The patch introduces PIF_SYSCALL_RET_SET, which is set if ptrace sets a return value for a syscall. The s390x ptrace ABI uses r2 both for the syscall number and return value, so ptrace cannot set the syscall number + return value at the same time. The flag makes handling that a bit easier. do_syscall() will just skip executing the syscall if PIF_SYSCALL_RET_SET is set. CONFIG_DEBUG_ASCE was removd in favour of the generic CONFIG_DEBUG_ENTRY. CR1/7/13 will be checked both on kernel entry and exit to contain the correct asces. Signed-off-by: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
2020-11-21 10:14:56 +00:00
if (user_mode(regs))
update_timer_mcck();
inc_irq_stat(NMI_NMI);
mci.val = S390_lowcore.mcck_interruption_code;
s390: Replace __get_cpu_var uses __get_cpu_var() is used for multiple purposes in the kernel source. One of them is address calculation via the form &__get_cpu_var(x). This calculates the address for the instance of the percpu variable of the current processor based on an offset. Other use cases are for storing and retrieving data from the current processors percpu area. __get_cpu_var() can be used as an lvalue when writing data or on the right side of an assignment. __get_cpu_var() is defined as : #define __get_cpu_var(var) (*this_cpu_ptr(&(var))) __get_cpu_var() always only does an address determination. However, store and retrieve operations could use a segment prefix (or global register on other platforms) to avoid the address calculation. this_cpu_write() and this_cpu_read() can directly take an offset into a percpu area and use optimized assembly code to read and write per cpu variables. This patch converts __get_cpu_var into either an explicit address calculation using this_cpu_ptr() or into a use of this_cpu operations that use the offset. Thereby address calculations are avoided and less registers are used when code is generated. At the end of the patch set all uses of __get_cpu_var have been removed so the macro is removed too. The patch set includes passes over all arches as well. Once these operations are used throughout then specialized macros can be defined in non -x86 arches as well in order to optimize per cpu access by f.e. using a global register that may be set to the per cpu base. Transformations done to __get_cpu_var() 1. Determine the address of the percpu instance of the current processor. DEFINE_PER_CPU(int, y); int *x = &__get_cpu_var(y); Converts to int *x = this_cpu_ptr(&y); 2. Same as #1 but this time an array structure is involved. DEFINE_PER_CPU(int, y[20]); int *x = __get_cpu_var(y); Converts to int *x = this_cpu_ptr(y); 3. Retrieve the content of the current processors instance of a per cpu variable. DEFINE_PER_CPU(int, y); int x = __get_cpu_var(y) Converts to int x = __this_cpu_read(y); 4. Retrieve the content of a percpu struct DEFINE_PER_CPU(struct mystruct, y); struct mystruct x = __get_cpu_var(y); Converts to memcpy(&x, this_cpu_ptr(&y), sizeof(x)); 5. Assignment to a per cpu variable DEFINE_PER_CPU(int, y) __get_cpu_var(y) = x; Converts to this_cpu_write(y, x); 6. Increment/Decrement etc of a per cpu variable DEFINE_PER_CPU(int, y); __get_cpu_var(y)++ Converts to this_cpu_inc(y) Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> CC: linux390@de.ibm.com Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com> Signed-off-by: Christoph Lameter <cl@linux.com> Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-17 17:30:45 +00:00
mcck = this_cpu_ptr(&cpu_mcck);
/*
* Reinject the instruction processing damages' machine checks
* including Delayed Access Exception into the guest
* instead of damaging the host if they happen in the guest.
*/
if (mci.pd && !test_cpu_flag(CIF_MCCK_GUEST)) {
if (mci.b) {
/* Processing backup -> verify if we can survive this */
u64 z_mcic, o_mcic, t_mcic;
z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29);
o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 |
1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 |
1ULL<<30 | 1ULL<<21 | 1ULL<<20 | 1ULL<<17 |
1ULL<<16);
t_mcic = mci.val;
if (((t_mcic & z_mcic) != 0) ||
((t_mcic & o_mcic) != o_mcic)) {
s390_handle_damage();
}
/*
* Nullifying exigent condition, therefore we might
* retry this instruction.
*/
spin_lock(&ipd_lock);
tmp = get_tod_clock();
if (((tmp - last_ipd) >> 12) < MAX_IPD_TIME)
ipd_count++;
else
ipd_count = 1;
last_ipd = tmp;
if (ipd_count == MAX_IPD_COUNT)
s390_handle_damage();
spin_unlock(&ipd_lock);
} else {
/* Processing damage -> stopping machine */
s390_handle_damage();
}
}
if (s390_validate_registers(mci)) {
if (!user_mode(regs))
s390_handle_damage();
/*
* Couldn't restore all register contents for the
* user space process -> mark task for termination.
*/
mcck->kill_task = 1;
mcck->mcck_code = mci.val;
mcck_pending = 1;
}
/*
* Backup the machine check's info if it happens when the guest
* is running.
*/
if (test_cpu_flag(CIF_MCCK_GUEST))
s390_backup_mcck_info(regs);
if (mci.cd) {
/* Timing facility damage */
s390_handle_damage();
}
if (mci.ed && mci.ec) {
/* External damage */
if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC))
mcck->stp_queue |= stp_sync_check();
if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
mcck->stp_queue |= stp_island_check();
mcck_pending = 1;
}
/*
* Reinject storage related machine checks into the guest if they
* happen when the guest is running.
*/
if (!test_cpu_flag(CIF_MCCK_GUEST)) {
/* Storage error uncorrected */
if (mci.se)
s390_handle_damage();
/* Storage key-error uncorrected */
if (mci.ke)
s390_handle_damage();
/* Storage degradation */
if (mci.ds && mci.fa)
s390_handle_damage();
}
if (mci.cp) {
/* Channel report word pending */
mcck->channel_report = 1;
mcck_pending = 1;
}
if (mci.w) {
/* Warning pending */
mcck->warning = 1;
mcck_pending = 1;
}
/*
* If there are only Channel Report Pending and External Damage
* machine checks, they will not be reinjected into the guest
* because they refer to host conditions only.
*/
mcck_dam_code = (mci.val & MCIC_SUBCLASS_MASK);
if (test_cpu_flag(CIF_MCCK_GUEST) &&
(mcck_dam_code & MCCK_CODE_NO_GUEST) != mcck_dam_code) {
/* Set exit reason code for host's later handling */
*((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR;
}
clear_cpu_flag(CIF_MCCK_GUEST);
if (mcck_pending)
schedule_mcck_handler();
irqentry_nmi_exit(regs, irq_state);
}
NOKPROBE_SYMBOL(s390_do_machine_check);
static int __init machine_check_init(void)
{
system_ctl_set_bit(14, CR14_EXTERNAL_DAMAGE_SUBMASK_BIT);
system_ctl_set_bit(14, CR14_RECOVERY_SUBMASK_BIT);
system_ctl_set_bit(14, CR14_WARNING_SUBMASK_BIT);
return 0;
}
early_initcall(machine_check_init);