185 lines
3.9 KiB
C
Raw Normal View History

/*
* Copyright 2011 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/of.h>
#include <asm/smp.h>
#include <asm/irq.h>
#include <asm/errno.h>
#include <asm/xics.h>
#include <asm/io.h>
#include <asm/hvcall.h>
static inline unsigned int icp_hv_get_xirr(unsigned char cppr)
{
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
long rc;
unsigned int ret = XICS_IRQ_SPURIOUS;
rc = plpar_hcall(H_XIRR, retbuf, cppr);
if (rc == H_SUCCESS) {
ret = (unsigned int)retbuf[0];
} else {
pr_err("%s: bad return code xirr cppr=0x%x returned %ld\n",
__func__, cppr, rc);
WARN_ON_ONCE(1);
}
return ret;
}
static inline void icp_hv_set_cppr(u8 value)
{
long rc = plpar_hcall_norets(H_CPPR, value);
if (rc != H_SUCCESS) {
pr_err("%s: bad return code cppr cppr=0x%x returned %ld\n",
__func__, value, rc);
WARN_ON_ONCE(1);
}
}
static inline void icp_hv_set_xirr(unsigned int value)
{
long rc = plpar_hcall_norets(H_EOI, value);
if (rc != H_SUCCESS) {
pr_err("%s: bad return code eoi xirr=0x%x returned %ld\n",
__func__, value, rc);
WARN_ON_ONCE(1);
icp_hv_set_cppr(value >> 24);
}
}
static inline void icp_hv_set_qirr(int n_cpu , u8 value)
{
int hw_cpu = get_hard_smp_processor_id(n_cpu);
powerpc: Make sure IPI handlers see data written by IPI senders We have been observing hangs, both of KVM guest vcpu tasks and more generally, where a process that is woken doesn't properly wake up and continue to run, but instead sticks in TASK_WAKING state. This happens because the update of rq->wake_list in ttwu_queue_remote() is not ordered with the update of ipi_message in smp_muxed_ipi_message_pass(), and the reading of rq->wake_list in scheduler_ipi() is not ordered with the reading of ipi_message in smp_ipi_demux(). Thus it is possible for the IPI receiver not to see the updated rq->wake_list and therefore conclude that there is nothing for it to do. In order to make sure that anything done before smp_send_reschedule() is ordered before anything done in the resulting call to scheduler_ipi(), this adds barriers in smp_muxed_message_pass() and smp_ipi_demux(). The barrier in smp_muxed_message_pass() is a full barrier to ensure that there is a full ordering between the smp_send_reschedule() caller and scheduler_ipi(). In smp_ipi_demux(), we use xchg() rather than xchg_local() because xchg() includes release and acquire barriers. Using xchg() rather than xchg_local() makes sense given that ipi_message is not just accessed locally. This moves the barrier between setting the message and calling the cause_ipi() function into the individual cause_ipi implementations. Most of them -- those that used outb, out_8 or similar -- already had a full barrier because out_8 etc. include a sync before the MMIO store. This adds an explicit barrier in the two remaining cases. These changes made no measurable difference to the speed of IPIs as measured using a simple ping-pong latency test across two CPUs on different cores of a POWER7 machine. The analysis of the reason why processes were not waking up properly is due to Milton Miller. Cc: stable@vger.kernel.org # v3.0+ Reported-by: Milton Miller <miltonm@bga.com> Signed-off-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2012-09-04 18:33:08 +00:00
long rc;
/* Make sure all previous accesses are ordered before IPI sending */
mb();
rc = plpar_hcall_norets(H_IPI, hw_cpu, value);
if (rc != H_SUCCESS) {
pr_err("%s: bad return code qirr cpu=%d hw_cpu=%d mfrr=0x%x "
"returned %ld\n", __func__, n_cpu, hw_cpu, value, rc);
WARN_ON_ONCE(1);
}
}
static void icp_hv_eoi(struct irq_data *d)
{
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
iosync();
icp_hv_set_xirr((xics_pop_cppr() << 24) | hw_irq);
}
static void icp_hv_teardown_cpu(void)
{
int cpu = smp_processor_id();
/* Clear any pending IPI */
icp_hv_set_qirr(cpu, 0xff);
}
static void icp_hv_flush_ipi(void)
{
/* We take the ipi irq but and never return so we
* need to EOI the IPI, but want to leave our priority 0
*
* should we check all the other interrupts too?
* should we be flagging idle loop instead?
* or creating some task to be scheduled?
*/
icp_hv_set_xirr((0x00 << 24) | XICS_IPI);
}
static unsigned int icp_hv_get_irq(void)
{
unsigned int xirr = icp_hv_get_xirr(xics_cppr_top());
unsigned int vec = xirr & 0x00ffffff;
unsigned int irq;
if (vec == XICS_IRQ_SPURIOUS)
return NO_IRQ;
irq = irq_find_mapping(xics_host, vec);
if (likely(irq != NO_IRQ)) {
xics_push_cppr(vec);
return irq;
}
/* We don't have a linux mapping, so have rtas mask it. */
xics_mask_unknown_vec(vec);
/* We might learn about it later, so EOI it */
icp_hv_set_xirr(xirr);
return NO_IRQ;
}
static void icp_hv_set_cpu_priority(unsigned char cppr)
{
xics_set_base_cppr(cppr);
icp_hv_set_cppr(cppr);
iosync();
}
#ifdef CONFIG_SMP
powerpc: Consolidate ipi message mux and demux Consolidate the mux and demux of ipi messages into smp.c and call a new smp_ops callback to actually trigger the ipi. The powerpc architecture code is optimised for having 4 distinct ipi triggers, which are mapped to 4 distinct messages (ipi many, ipi single, scheduler ipi, and enter debugger). However, several interrupt controllers only provide a single software triggered interrupt that can be delivered to each cpu. To resolve this limitation, each smp_ops implementation created a per-cpu variable that is manipulated with atomic bitops. Since these lines will be contended they are optimialy marked as shared_aligned and take a full cache line for each cpu. Distro kernels may have 2 or 3 of these in their config, each taking per-cpu space even though at most one will be in use. This consolidation removes smp_message_recv and replaces the single call actions cases with direct calls from the common message recognition loop. The complicated debugger ipi case with its muxed crash handling code is moved to debug_ipi_action which is now called from the demux code (instead of the multi-message action calling smp_message_recv). I put a call to reschedule_action to increase the likelyhood of correctly merging the anticipated scheduler_ipi() hook coming from the scheduler tree; that single required call can be inlined later. The actual message decode is a copy of the old pseries xics code with its memory barriers and cache line spacing, augmented with a per-cpu unsigned long based on the book-e doorbell code. The optional data is set via a callback from the implementation and is passed to the new cause-ipi hook along with the logical cpu number. While currently only the doorbell implemntation uses this data it should be almost zero cost to retrieve and pass it -- it adds a single register load for the argument from the same cache line to which we just completed a store and the register is dead on return from the call. I extended the data element from unsigned int to unsigned long in case some other code wanted to associate a pointer. The doorbell check_self is replaced by a call to smp_muxed_ipi_resend, conditioned on the CPU_DBELL feature. The ifdef guard could be relaxed to CONFIG_SMP but I left it with BOOKE for now. Also, the doorbell interrupt vector for book-e was not calling irq_enter and irq_exit, which throws off cpu accounting and causes code to not realize it is running in interrupt context. Add the missing calls. Signed-off-by: Milton Miller <miltonm@bga.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2011-05-10 19:29:39 +00:00
static void icp_hv_cause_ipi(int cpu, unsigned long data)
{
icp_hv_set_qirr(cpu, IPI_PRIORITY);
}
static irqreturn_t icp_hv_ipi_action(int irq, void *dev_id)
{
int cpu = smp_processor_id();
icp_hv_set_qirr(cpu, 0xff);
powerpc: Consolidate ipi message mux and demux Consolidate the mux and demux of ipi messages into smp.c and call a new smp_ops callback to actually trigger the ipi. The powerpc architecture code is optimised for having 4 distinct ipi triggers, which are mapped to 4 distinct messages (ipi many, ipi single, scheduler ipi, and enter debugger). However, several interrupt controllers only provide a single software triggered interrupt that can be delivered to each cpu. To resolve this limitation, each smp_ops implementation created a per-cpu variable that is manipulated with atomic bitops. Since these lines will be contended they are optimialy marked as shared_aligned and take a full cache line for each cpu. Distro kernels may have 2 or 3 of these in their config, each taking per-cpu space even though at most one will be in use. This consolidation removes smp_message_recv and replaces the single call actions cases with direct calls from the common message recognition loop. The complicated debugger ipi case with its muxed crash handling code is moved to debug_ipi_action which is now called from the demux code (instead of the multi-message action calling smp_message_recv). I put a call to reschedule_action to increase the likelyhood of correctly merging the anticipated scheduler_ipi() hook coming from the scheduler tree; that single required call can be inlined later. The actual message decode is a copy of the old pseries xics code with its memory barriers and cache line spacing, augmented with a per-cpu unsigned long based on the book-e doorbell code. The optional data is set via a callback from the implementation and is passed to the new cause-ipi hook along with the logical cpu number. While currently only the doorbell implemntation uses this data it should be almost zero cost to retrieve and pass it -- it adds a single register load for the argument from the same cache line to which we just completed a store and the register is dead on return from the call. I extended the data element from unsigned int to unsigned long in case some other code wanted to associate a pointer. The doorbell check_self is replaced by a call to smp_muxed_ipi_resend, conditioned on the CPU_DBELL feature. The ifdef guard could be relaxed to CONFIG_SMP but I left it with BOOKE for now. Also, the doorbell interrupt vector for book-e was not calling irq_enter and irq_exit, which throws off cpu accounting and causes code to not realize it is running in interrupt context. Add the missing calls. Signed-off-by: Milton Miller <miltonm@bga.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2011-05-10 19:29:39 +00:00
return smp_ipi_demux();
}
#endif /* CONFIG_SMP */
static const struct icp_ops icp_hv_ops = {
.get_irq = icp_hv_get_irq,
.eoi = icp_hv_eoi,
.set_priority = icp_hv_set_cpu_priority,
.teardown_cpu = icp_hv_teardown_cpu,
.flush_ipi = icp_hv_flush_ipi,
#ifdef CONFIG_SMP
.ipi_action = icp_hv_ipi_action,
powerpc: Consolidate ipi message mux and demux Consolidate the mux and demux of ipi messages into smp.c and call a new smp_ops callback to actually trigger the ipi. The powerpc architecture code is optimised for having 4 distinct ipi triggers, which are mapped to 4 distinct messages (ipi many, ipi single, scheduler ipi, and enter debugger). However, several interrupt controllers only provide a single software triggered interrupt that can be delivered to each cpu. To resolve this limitation, each smp_ops implementation created a per-cpu variable that is manipulated with atomic bitops. Since these lines will be contended they are optimialy marked as shared_aligned and take a full cache line for each cpu. Distro kernels may have 2 or 3 of these in their config, each taking per-cpu space even though at most one will be in use. This consolidation removes smp_message_recv and replaces the single call actions cases with direct calls from the common message recognition loop. The complicated debugger ipi case with its muxed crash handling code is moved to debug_ipi_action which is now called from the demux code (instead of the multi-message action calling smp_message_recv). I put a call to reschedule_action to increase the likelyhood of correctly merging the anticipated scheduler_ipi() hook coming from the scheduler tree; that single required call can be inlined later. The actual message decode is a copy of the old pseries xics code with its memory barriers and cache line spacing, augmented with a per-cpu unsigned long based on the book-e doorbell code. The optional data is set via a callback from the implementation and is passed to the new cause-ipi hook along with the logical cpu number. While currently only the doorbell implemntation uses this data it should be almost zero cost to retrieve and pass it -- it adds a single register load for the argument from the same cache line to which we just completed a store and the register is dead on return from the call. I extended the data element from unsigned int to unsigned long in case some other code wanted to associate a pointer. The doorbell check_self is replaced by a call to smp_muxed_ipi_resend, conditioned on the CPU_DBELL feature. The ifdef guard could be relaxed to CONFIG_SMP but I left it with BOOKE for now. Also, the doorbell interrupt vector for book-e was not calling irq_enter and irq_exit, which throws off cpu accounting and causes code to not realize it is running in interrupt context. Add the missing calls. Signed-off-by: Milton Miller <miltonm@bga.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2011-05-10 19:29:39 +00:00
.cause_ipi = icp_hv_cause_ipi,
#endif
};
int icp_hv_init(void)
{
struct device_node *np;
np = of_find_compatible_node(NULL, NULL, "ibm,ppc-xicp");
if (!np)
np = of_find_node_by_type(NULL,
"PowerPC-External-Interrupt-Presentation");
if (!np)
return -ENODEV;
icp_ops = &icp_hv_ops;
return 0;
}