diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl
index 1448b33fd222..fb10fd08c05c 100644
--- a/Documentation/DocBook/genericirq.tmpl
+++ b/Documentation/DocBook/genericirq.tmpl
@@ -28,7 +28,7 @@
- 2005-2006
+ 2005-2010
Thomas Gleixner
@@ -100,6 +100,10 @@
Edge type
Simple type
+ During the implementation we identified another type:
+
+ Fast EOI type
+
In the SMP world of the __do_IRQ() super-handler another type
was identified:
@@ -153,6 +157,7 @@
is still available. This leads to a kind of duality for the time
being. Over time the new model should be used in more and more
architectures, as it enables smaller and cleaner IRQ subsystems.
+ It's deprecated for three years now and about to be removed.
@@ -217,6 +222,7 @@
handle_level_irq
handle_edge_irq
+ handle_fasteoi_irq
handle_simple_irq
handle_percpu_irq
@@ -233,33 +239,33 @@
are used by the default flow implementations.
The following helper functions are implemented (simplified excerpt):
-default_enable(irq)
+default_enable(struct irq_data *data)
{
- desc->chip->unmask(irq);
+ desc->chip->irq_unmask(data);
}
-default_disable(irq)
+default_disable(struct irq_data *data)
{
- if (!delay_disable(irq))
- desc->chip->mask(irq);
+ if (!delay_disable(data))
+ desc->chip->irq_mask(data);
}
-default_ack(irq)
+default_ack(struct irq_data *data)
{
- chip->ack(irq);
+ chip->irq_ack(data);
}
-default_mask_ack(irq)
+default_mask_ack(struct irq_data *data)
{
- if (chip->mask_ack) {
- chip->mask_ack(irq);
+ if (chip->irq_mask_ack) {
+ chip->irq_mask_ack(data);
} else {
- chip->mask(irq);
- chip->ack(irq);
+ chip->irq_mask(data);
+ chip->irq_ack(data);
}
}
-noop(irq)
+noop(struct irq_data *data))
{
}
@@ -278,12 +284,27 @@ noop(irq)
The following control flow is implemented (simplified excerpt):
-desc->chip->start();
+desc->chip->irq_mask();
handle_IRQ_event(desc->action);
-desc->chip->end();
+desc->chip->irq_unmask();
-
+
+
+ Default Fast EOI IRQ flow handler
+
+ handle_fasteoi_irq provides a generic implementation
+ for interrupts, which only need an EOI at the end of
+ the handler
+
+
+ The following control flow is implemented (simplified excerpt):
+
+handle_IRQ_event(desc->action);
+desc->chip->irq_eoi();
+
+
+
Default Edge IRQ flow handler
@@ -294,20 +315,19 @@ desc->chip->end();
The following control flow is implemented (simplified excerpt):
if (desc->status & running) {
- desc->chip->hold();
+ desc->chip->irq_mask();
desc->status |= pending | masked;
return;
}
-desc->chip->start();
+desc->chip->irq_ack();
desc->status |= running;
do {
if (desc->status & masked)
- desc->chip->enable();
+ desc->chip->irq_unmask();
desc->status &= ~pending;
handle_IRQ_event(desc->action);
} while (status & pending);
desc->status &= ~running;
-desc->chip->end();
@@ -342,9 +362,9 @@ handle_IRQ_event(desc->action);
The following control flow is implemented (simplified excerpt):
-desc->chip->start();
handle_IRQ_event(desc->action);
-desc->chip->end();
+if (desc->chip->irq_eoi)
+ desc->chip->irq_eoi();
@@ -375,8 +395,7 @@ desc->chip->end();
mechanism. (It's necessary to enable CONFIG_HARDIRQS_SW_RESEND when
you want to use the delayed interrupt disable feature and your
hardware is not capable of retriggering an interrupt.)
- The delayed interrupt disable can be runtime enabled, per interrupt,
- by setting the IRQ_DELAYED_DISABLE flag in the irq_desc status field.
+ The delayed interrupt disable is not configurable.
@@ -387,13 +406,13 @@ desc->chip->end();
contains all the direct chip relevant functions, which
can be utilized by the irq flow implementations.
- ack()
- mask_ack() - Optional, recommended for performance
- mask()
- unmask()
- retrigger() - Optional
- set_type() - Optional
- set_wake() - Optional
+ irq_ack()
+ irq_mask_ack() - Optional, recommended for performance
+ irq_mask()
+ irq_unmask()
+ irq_retrigger() - Optional
+ irq_set_type() - Optional
+ irq_set_wake() - Optional
These primitives are strictly intended to mean what they say: ack means
ACK, masking means masking of an IRQ line, etc. It is up to the flow
@@ -458,6 +477,7 @@ desc->chip->end();
This chapter contains the autogenerated documentation of the internal functions.
+!Ikernel/irq/irqdesc.c
!Ikernel/irq/handle.c
!Ikernel/irq/chip.c
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
index a0d479d1e1dd..f66f4df18690 100644
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -1645,7 +1645,9 @@ the amount of locking which needs to be done.
all the readers who were traversing the list when we deleted the
element are finished. We use call_rcu() to
register a callback which will actually destroy the object once
- the readers are finished.
+ all pre-existing readers are finished. Alternatively,
+ synchronize_rcu() may be used to block until
+ all pre-existing are finished.
But how does Read Copy Update know when the readers are
@@ -1714,7 +1716,7 @@ the amount of locking which needs to be done.
- object_put(obj);
+ list_del_rcu(&obj->list);
cache_num--;
-+ call_rcu(&obj->rcu, cache_delete_rcu, obj);
++ call_rcu(&obj->rcu, cache_delete_rcu);
}
/* Must be holding cache_lock */
@@ -1725,14 +1727,6 @@ the amount of locking which needs to be done.
if (++cache_num > MAX_CACHE_SIZE) {
struct object *i, *outcast = NULL;
list_for_each_entry(i, &cache, list) {
-@@ -85,6 +94,7 @@
- obj->popularity = 0;
- atomic_set(&obj->refcnt, 1); /* The cache holds a reference */
- spin_lock_init(&obj->lock);
-+ INIT_RCU_HEAD(&obj->rcu);
-
- spin_lock_irqsave(&cache_lock, flags);
- __cache_add(obj);
@@ -104,12 +114,11 @@
struct object *cache_find(int id)
{
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 790d1a812376..0c134f8afc6f 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome!
include:
a. Keeping a count of the number of data-structure elements
- used by the RCU-protected data structure, including those
- waiting for a grace period to elapse. Enforce a limit
- on this number, stalling updates as needed to allow
- previously deferred frees to complete.
+ used by the RCU-protected data structure, including
+ those waiting for a grace period to elapse. Enforce a
+ limit on this number, stalling updates as needed to allow
+ previously deferred frees to complete. Alternatively,
+ limit only the number awaiting deferred free rather than
+ the total number of elements.
- Alternatively, limit only the number awaiting deferred
- free rather than the total number of elements.
+ One way to stall the updates is to acquire the update-side
+ mutex. (Don't try this with a spinlock -- other CPUs
+ spinning on the lock could prevent the grace period
+ from ever ending.) Another way to stall the updates
+ is for the updates to use a wrapper function around
+ the memory allocator, so that this wrapper function
+ simulates OOM when there is too much memory awaiting an
+ RCU grace period. There are of course many other
+ variations on this theme.
b. Limiting update rate. For example, if updates occur only
once per hour, then no explicit rate limiting is required,
@@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome!
and the compiler to freely reorder code into and out of RCU
read-side critical sections. It is the responsibility of the
RCU update-side primitives to deal with this.
+
+17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and
+ the __rcu sparse checks to validate your RCU code. These
+ can help find problems as follows:
+
+ CONFIG_PROVE_RCU: check that accesses to RCU-protected data
+ structures are carried out under the proper RCU
+ read-side critical section, while holding the right
+ combination of locks, or whatever other conditions
+ are appropriate.
+
+ CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the
+ same object to call_rcu() (or friends) before an RCU
+ grace period has elapsed since the last time that you
+ passed that same object to call_rcu() (or friends).
+
+ __rcu sparse checks: tag the pointer to the RCU-protected data
+ structure with __rcu, and sparse will warn you if you
+ access that pointer without the services of one of the
+ variants of rcu_dereference().
+
+ These debugging aids can help you find problems that are
+ otherwise extremely difficult to spot.
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 44c6dcc93d6d..862c08ef1fde 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -80,6 +80,24 @@ o A CPU looping with bottom halves disabled. This condition can
o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
without invoking schedule().
+o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+ happen to preempt a low-priority task in the middle of an RCU
+ read-side critical section. This is especially damaging if
+ that low-priority task is not permitted to run on any other CPU,
+ in which case the next RCU grace period can never complete, which
+ will eventually cause the system to run out of memory and hang.
+ While the system is in the process of running itself out of
+ memory, you might see stall-warning messages.
+
+o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
+ is running at a higher priority than the RCU softirq threads.
+ This will prevent RCU callbacks from ever being invoked,
+ and in a CONFIG_TREE_PREEMPT_RCU kernel will further prevent
+ RCU grace periods from ever completing. Either way, the
+ system will eventually run out of memory and hang. In the
+ CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
+ messages.
+
o A bug in the RCU implementation.
o A hardware failure. This is quite unlikely, but has occurred
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index efd8cc95c06b..a851118775d8 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -125,6 +125,17 @@ o "b" is the batch limit for this CPU. If more than this number
of RCU callbacks is ready to invoke, then the remainder will
be deferred.
+o "ci" is the number of RCU callbacks that have been invoked for
+ this CPU. Note that ci+ql is the number of callbacks that have
+ been registered in absence of CPU-hotplug activity.
+
+o "co" is the number of RCU callbacks that have been orphaned due to
+ this CPU going offline.
+
+o "ca" is the number of RCU callbacks that have been adopted due to
+ other CPUs going offline. Note that ci+co-ca+ql is the number of
+ RCU callbacks registered on this CPU.
+
There is also an rcu/rcudata.csv file with the same information in
comma-separated-variable spreadsheet format.
@@ -180,7 +191,7 @@ o "s" is the "signaled" state that drives force_quiescent_state()'s
o "jfq" is the number of jiffies remaining for this grace period
before force_quiescent_state() is invoked to help push things
- along. Note that CPUs in dyntick-idle mode thoughout the grace
+ along. Note that CPUs in dyntick-idle mode throughout the grace
period will not report on their own, but rather must be check by
some other CPU via force_quiescent_state().
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index f1c5c4bccd3e..902d3151f527 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -14,25 +14,39 @@ to /proc/cpuinfo.
identifier (rather than the kernel's). The actual value is
architecture and platform dependent.
-3) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
+3) /sys/devices/system/cpu/cpuX/topology/book_id:
+
+ the book ID of cpuX. Typically it is the hardware platform's
+ identifier (rather than the kernel's). The actual value is
+ architecture and platform dependent.
+
+4) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
internel kernel map of cpuX's hardware threads within the same
core as cpuX
-4) /sys/devices/system/cpu/cpuX/topology/core_siblings:
+5) /sys/devices/system/cpu/cpuX/topology/core_siblings:
internal kernel map of cpuX's hardware threads within the same
physical_package_id.
+6) /sys/devices/system/cpu/cpuX/topology/book_siblings:
+
+ internal kernel map of cpuX's hardware threads within the same
+ book_id.
+
To implement it in an architecture-neutral way, a new source file,
-drivers/base/topology.c, is to export the 4 attributes.
+drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
+related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
For an architecture to support this feature, it must define some of
these macros in include/asm-XXX/topology.h:
#define topology_physical_package_id(cpu)
#define topology_core_id(cpu)
+#define topology_book_id(cpu)
#define topology_thread_cpumask(cpu)
#define topology_core_cpumask(cpu)
+#define topology_book_cpumask(cpu)
The type of **_id is int.
The type of siblings is (const) struct cpumask *.
@@ -45,6 +59,9 @@ not defined by include/asm-XXX/topology.h:
3) thread_siblings: just the given CPU
4) core_siblings: just the given CPU
+For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
+default definitions for topology_book_id() and topology_book_cpumask().
+
Additionally, CPU topology information is provided under
/sys/devices/system/cpu and includes these files. The internal
source for the output is in brackets ("[]").
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 842aa9de84a6..5e2bc4ab897a 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -386,34 +386,6 @@ Who: Tejun Heo
----------------------------
-What: Support for VMware's guest paravirtuliazation technique [VMI] will be
- dropped.
-When: 2.6.37 or earlier.
-Why: With the recent innovations in CPU hardware acceleration technologies
- from Intel and AMD, VMware ran a few experiments to compare these
- techniques to guest paravirtualization technique on VMware's platform.
- These hardware assisted virtualization techniques have outperformed the
- performance benefits provided by VMI in most of the workloads. VMware
- expects that these hardware features will be ubiquitous in a couple of
- years, as a result, VMware has started a phased retirement of this
- feature from the hypervisor. We will be removing this feature from the
- Kernel too. Right now we are targeting 2.6.37 but can retire earlier if
- technical reasons (read opportunity to remove major chunk of pvops)
- arise.
-
- Please note that VMI has always been an optimization and non-VMI kernels
- still work fine on VMware's platform.
- Latest versions of VMware's product which support VMI are,
- Workstation 7.0 and VSphere 4.0 on ESX side, future maintainence
- releases for these products will continue supporting VMI.
-
- For more details about VMI retirement take a look at this,
- http://blogs.vmware.com/guestosguide/2009/09/vmi-retirement.html
-
-Who: Alok N Kataria
-
-----------------------------
-
What: Support for lcd_switch and display_get in asus-laptop driver
When: March 2010
Why: These two features use non-standard interfaces. There are the
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8dd7248508a9..3a0009e03d14 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -455,7 +455,7 @@ and is between 256 and 4096 characters. It is defined in the file
[ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
pxa_timer,timer3,32k_counter,timer0_1
[AVR32] avr32
- [X86-32] pit,hpet,tsc,vmi-timer;
+ [X86-32] pit,hpet,tsc;
scx200_hrt on Geode; cyclone on IBM x440
[MIPS] MIPS
[PARISC] cr16
@@ -2153,6 +2153,11 @@ and is between 256 and 4096 characters. It is defined in the file
Reserves a hole at the top of the kernel virtual
address space.
+ reservelow= [X86]
+ Format: nn[K]
+ Set the amount of memory to reserve for BIOS at
+ the bottom of the address space.
+
reset_devices [KNL] Force drivers to reset the underlying device
during initialization.
@@ -2435,6 +2440,10 @@ and is between 256 and 4096 characters. It is defined in the file
disables clocksource verification at runtime.
Used to enable high-resolution timer mode on older
hardware, and in virtualized environment.
+ [x86] noirqtime: Do not use TSC to do irq accounting.
+ Used to run time disable IRQ_TIME_ACCOUNTING on any
+ platforms where RDTSC is slow and this accounting
+ can add overhead.
turbografx.map[2|3]= [HW,JOY]
TurboGraFX parallel port interface
diff --git a/MAINTAINERS b/MAINTAINERS
index f2a2b8e647c5..6f5b5b2b528d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1527,6 +1527,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
S: Supported
F: Documentation/filesystems/ceph.txt
F: fs/ceph
+F: net/ceph
+F: include/linux/ceph
CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
M: David Vrabel
@@ -3239,6 +3241,12 @@ F: drivers/net/irda/
F: include/net/irda/
F: net/irda/
+IRQ SUBSYSTEM
+M: Thomas Gleixner
+S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git irq/core
+F: kernel/irq/
+
ISAPNP
M: Jaroslav Kysela
S: Maintained
@@ -4805,6 +4813,15 @@ F: fs/qnx4/
F: include/linux/qnx4_fs.h
F: include/linux/qnxtypes.h
+RADOS BLOCK DEVICE (RBD)
+F: include/linux/qnxtypes.h
+M: Yehuda Sadeh
+M: Sage Weil
+M: ceph-devel@vger.kernel.org
+S: Supported
+F: drivers/block/rbd.c
+F: drivers/block/rbd_types.h
+
RADEON FRAMEBUFFER DISPLAY DRIVER
M: Benjamin Herrenschmidt
L: linux-fbdev@vger.kernel.org
diff --git a/arch/arm/include/asm/hw_irq.h b/arch/arm/include/asm/hw_irq.h
index 90831f6f5f5c..5586b7c8ef6f 100644
--- a/arch/arm/include/asm/hw_irq.h
+++ b/arch/arm/include/asm/hw_irq.h
@@ -24,4 +24,6 @@ void set_irq_flags(unsigned int irq, unsigned int flags);
#define IRQF_PROBE (1 << 1)
#define IRQF_NOAUTOEN (1 << 2)
+#define ARCH_IRQ_INIT_FLAGS (IRQ_NOREQUEST | IRQ_NOPROBE)
+
#endif
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index c0d5c3b3a760..36ad3be4692a 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -154,14 +154,6 @@ void set_irq_flags(unsigned int irq, unsigned int iflags)
void __init init_IRQ(void)
{
- struct irq_desc *desc;
- int irq;
-
- for (irq = 0; irq < nr_irqs; irq++) {
- desc = irq_to_desc_alloc_node(irq, 0);
- desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
- }
-
init_arch_irq();
}
@@ -169,7 +161,7 @@ void __init init_IRQ(void)
int __init arch_probe_nr_irqs(void)
{
nr_irqs = arch_nr_irqs ? arch_nr_irqs : NR_IRQS;
- return 0;
+ return nr_irqs;
}
#endif
diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c
index 29c0a911df26..77eb35c89cd0 100644
--- a/arch/arm/mach-bcmring/dma.c
+++ b/arch/arm/mach-bcmring/dma.c
@@ -691,7 +691,7 @@ int dma_init(void)
memset(&gDMA, 0, sizeof(gDMA));
- init_MUTEX_LOCKED(&gDMA.lock);
+ sema_init(&gDMA.lock, 0);
init_waitqueue_head(&gDMA.freeChannelQ);
/* Initialize the Hardware */
@@ -1574,7 +1574,7 @@ int dma_init_mem_map(DMA_MemMap_t *memMap)
{
memset(memMap, 0, sizeof(*memMap));
- init_MUTEX(&memMap->lock);
+ sema_init(&memMap->lock, 1);
return 0;
}
diff --git a/arch/arm/mach-bcmring/irq.c b/arch/arm/mach-bcmring/irq.c
index dc1c4939b0ce..e3152631eb37 100644
--- a/arch/arm/mach-bcmring/irq.c
+++ b/arch/arm/mach-bcmring/irq.c
@@ -67,21 +67,21 @@ static void bcmring_unmask_irq2(unsigned int irq)
}
static struct irq_chip bcmring_irq0_chip = {
- .typename = "ARM-INTC0",
+ .name = "ARM-INTC0",
.ack = bcmring_mask_irq0,
.mask = bcmring_mask_irq0, /* mask a specific interrupt, blocking its delivery. */
.unmask = bcmring_unmask_irq0, /* unmaks an interrupt */
};
static struct irq_chip bcmring_irq1_chip = {
- .typename = "ARM-INTC1",
+ .name = "ARM-INTC1",
.ack = bcmring_mask_irq1,
.mask = bcmring_mask_irq1,
.unmask = bcmring_unmask_irq1,
};
static struct irq_chip bcmring_irq2_chip = {
- .typename = "ARM-SINTC",
+ .name = "ARM-SINTC",
.ack = bcmring_mask_irq2,
.mask = bcmring_mask_irq2,
.unmask = bcmring_unmask_irq2,
diff --git a/arch/arm/mach-iop13xx/msi.c b/arch/arm/mach-iop13xx/msi.c
index f34b0ed80630..7149fcc16c8a 100644
--- a/arch/arm/mach-iop13xx/msi.c
+++ b/arch/arm/mach-iop13xx/msi.c
@@ -164,10 +164,10 @@ static void iop13xx_msi_nop(unsigned int irq)
static struct irq_chip iop13xx_msi_chip = {
.name = "PCI-MSI",
.ack = iop13xx_msi_nop,
- .enable = unmask_msi_irq,
- .disable = mask_msi_irq,
- .mask = mask_msi_irq,
- .unmask = unmask_msi_irq,
+ .irq_enable = unmask_msi_irq,
+ .irq_disable = mask_msi_irq,
+ .irq_mask = mask_msi_irq,
+ .irq_unmask = unmask_msi_irq,
};
int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h
index d514cd9edb49..8fb7d33a661f 100644
--- a/arch/ia64/include/asm/hardirq.h
+++ b/arch/ia64/include/asm/hardirq.h
@@ -6,12 +6,6 @@
* David Mosberger-Tang
*/
-
-#include
-#include
-
-#include
-
/*
* No irq_cpustat_t for IA-64. The data is held in the per-CPU data structure.
*/
@@ -20,6 +14,11 @@
#define local_softirq_pending() (local_cpu_data->softirq_pending)
+#include
+#include
+
+#include
+
extern void __iomem *ipi_base_addr;
void ack_bad_irq(unsigned int irq);
diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h
index 9f342a574ce8..dd028f2b13b3 100644
--- a/arch/ia64/include/asm/system.h
+++ b/arch/ia64/include/asm/system.h
@@ -272,10 +272,6 @@ void cpu_idle_wait(void);
void default_idle(void);
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void account_system_vtime(struct task_struct *);
-#endif
-
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 4a746ea838ff..00b19a416eab 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -104,8 +104,8 @@ static int ia64_msi_retrigger_irq(unsigned int irq)
*/
static struct irq_chip ia64_msi_chip = {
.name = "PCI-MSI",
- .mask = mask_msi_irq,
- .unmask = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
+ .irq_unmask = unmask_msi_irq,
.ack = ia64_ack_msi_irq,
#ifdef CONFIG_SMP
.set_affinity = ia64_set_msi_irq_affinity,
@@ -160,8 +160,8 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
static struct irq_chip dmar_msi_type = {
.name = "DMAR_MSI",
- .unmask = dmar_msi_unmask,
- .mask = dmar_msi_mask,
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
.ack = ia64_ack_msi_irq,
#ifdef CONFIG_SMP
.set_affinity = dmar_msi_set_affinity,
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index 0c72dd463831..a5e500f02853 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -228,8 +228,8 @@ static int sn_msi_retrigger_irq(unsigned int irq)
static struct irq_chip sn_msi_chip = {
.name = "PCI-MSI",
- .mask = mask_msi_irq,
- .unmask = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
+ .irq_unmask = unmask_msi_irq,
.ack = sn_ack_msi_irq,
#ifdef CONFIG_SMP
.set_affinity = sn_set_msi_irq_affinity,
diff --git a/arch/m32r/kernel/irq.c b/arch/m32r/kernel/irq.c
index 3c71f776872c..7db26f1f082d 100644
--- a/arch/m32r/kernel/irq.c
+++ b/arch/m32r/kernel/irq.c
@@ -51,7 +51,7 @@ int show_interrupts(struct seq_file *p, void *v)
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
#endif
- seq_printf(p, " %14s", irq_desc[i].chip->typename);
+ seq_printf(p, " %14s", irq_desc[i].chip->name);
seq_printf(p, " %s", action->name);
for (action=action->next; action; action = action->next)
diff --git a/arch/m32r/platforms/m32104ut/setup.c b/arch/m32r/platforms/m32104ut/setup.c
index 922fdfdadeaa..402a59d7219b 100644
--- a/arch/m32r/platforms/m32104ut/setup.c
+++ b/arch/m32r/platforms/m32104ut/setup.c
@@ -65,7 +65,7 @@ static void shutdown_m32104ut_irq(unsigned int irq)
static struct irq_chip m32104ut_irq_type =
{
- .typename = "M32104UT-IRQ",
+ .name = "M32104UT-IRQ",
.startup = startup_m32104ut_irq,
.shutdown = shutdown_m32104ut_irq,
.enable = enable_m32104ut_irq,
diff --git a/arch/m32r/platforms/m32700ut/setup.c b/arch/m32r/platforms/m32700ut/setup.c
index 9c1bc7487c1e..80b1a026795a 100644
--- a/arch/m32r/platforms/m32700ut/setup.c
+++ b/arch/m32r/platforms/m32700ut/setup.c
@@ -71,7 +71,7 @@ static void shutdown_m32700ut_irq(unsigned int irq)
static struct irq_chip m32700ut_irq_type =
{
- .typename = "M32700UT-IRQ",
+ .name = "M32700UT-IRQ",
.startup = startup_m32700ut_irq,
.shutdown = shutdown_m32700ut_irq,
.enable = enable_m32700ut_irq,
@@ -148,7 +148,7 @@ static void shutdown_m32700ut_pld_irq(unsigned int irq)
static struct irq_chip m32700ut_pld_irq_type =
{
- .typename = "M32700UT-PLD-IRQ",
+ .name = "M32700UT-PLD-IRQ",
.startup = startup_m32700ut_pld_irq,
.shutdown = shutdown_m32700ut_pld_irq,
.enable = enable_m32700ut_pld_irq,
@@ -217,7 +217,7 @@ static void shutdown_m32700ut_lanpld_irq(unsigned int irq)
static struct irq_chip m32700ut_lanpld_irq_type =
{
- .typename = "M32700UT-PLD-LAN-IRQ",
+ .name = "M32700UT-PLD-LAN-IRQ",
.startup = startup_m32700ut_lanpld_irq,
.shutdown = shutdown_m32700ut_lanpld_irq,
.enable = enable_m32700ut_lanpld_irq,
@@ -286,7 +286,7 @@ static void shutdown_m32700ut_lcdpld_irq(unsigned int irq)
static struct irq_chip m32700ut_lcdpld_irq_type =
{
- .typename = "M32700UT-PLD-LCD-IRQ",
+ .name = "M32700UT-PLD-LCD-IRQ",
.startup = startup_m32700ut_lcdpld_irq,
.shutdown = shutdown_m32700ut_lcdpld_irq,
.enable = enable_m32700ut_lcdpld_irq,
diff --git a/arch/m32r/platforms/mappi/setup.c b/arch/m32r/platforms/mappi/setup.c
index fb4b17799b66..ea00c84d6b1b 100644
--- a/arch/m32r/platforms/mappi/setup.c
+++ b/arch/m32r/platforms/mappi/setup.c
@@ -65,7 +65,7 @@ static void shutdown_mappi_irq(unsigned int irq)
static struct irq_chip mappi_irq_type =
{
- .typename = "MAPPI-IRQ",
+ .name = "MAPPI-IRQ",
.startup = startup_mappi_irq,
.shutdown = shutdown_mappi_irq,
.enable = enable_mappi_irq,
diff --git a/arch/m32r/platforms/mappi2/setup.c b/arch/m32r/platforms/mappi2/setup.c
index 6a65eda0a056..c049376d0270 100644
--- a/arch/m32r/platforms/mappi2/setup.c
+++ b/arch/m32r/platforms/mappi2/setup.c
@@ -72,7 +72,7 @@ static void shutdown_mappi2_irq(unsigned int irq)
static struct irq_chip mappi2_irq_type =
{
- .typename = "MAPPI2-IRQ",
+ .name = "MAPPI2-IRQ",
.startup = startup_mappi2_irq,
.shutdown = shutdown_mappi2_irq,
.enable = enable_mappi2_irq,
diff --git a/arch/m32r/platforms/mappi3/setup.c b/arch/m32r/platforms/mappi3/setup.c
index 9c337aeac94b..882de25c6e8c 100644
--- a/arch/m32r/platforms/mappi3/setup.c
+++ b/arch/m32r/platforms/mappi3/setup.c
@@ -72,7 +72,7 @@ static void shutdown_mappi3_irq(unsigned int irq)
static struct irq_chip mappi3_irq_type =
{
- .typename = "MAPPI3-IRQ",
+ .name = "MAPPI3-IRQ",
.startup = startup_mappi3_irq,
.shutdown = shutdown_mappi3_irq,
.enable = enable_mappi3_irq,
diff --git a/arch/m32r/platforms/oaks32r/setup.c b/arch/m32r/platforms/oaks32r/setup.c
index ed865741c38d..d11d93bf74f5 100644
--- a/arch/m32r/platforms/oaks32r/setup.c
+++ b/arch/m32r/platforms/oaks32r/setup.c
@@ -63,7 +63,7 @@ static void shutdown_oaks32r_irq(unsigned int irq)
static struct irq_chip oaks32r_irq_type =
{
- .typename = "OAKS32R-IRQ",
+ .name = "OAKS32R-IRQ",
.startup = startup_oaks32r_irq,
.shutdown = shutdown_oaks32r_irq,
.enable = enable_oaks32r_irq,
diff --git a/arch/m32r/platforms/opsput/setup.c b/arch/m32r/platforms/opsput/setup.c
index 80d680657019..5f3402a2fbaf 100644
--- a/arch/m32r/platforms/opsput/setup.c
+++ b/arch/m32r/platforms/opsput/setup.c
@@ -72,7 +72,7 @@ static void shutdown_opsput_irq(unsigned int irq)
static struct irq_chip opsput_irq_type =
{
- .typename = "OPSPUT-IRQ",
+ .name = "OPSPUT-IRQ",
.startup = startup_opsput_irq,
.shutdown = shutdown_opsput_irq,
.enable = enable_opsput_irq,
@@ -149,7 +149,7 @@ static void shutdown_opsput_pld_irq(unsigned int irq)
static struct irq_chip opsput_pld_irq_type =
{
- .typename = "OPSPUT-PLD-IRQ",
+ .name = "OPSPUT-PLD-IRQ",
.startup = startup_opsput_pld_irq,
.shutdown = shutdown_opsput_pld_irq,
.enable = enable_opsput_pld_irq,
@@ -218,7 +218,7 @@ static void shutdown_opsput_lanpld_irq(unsigned int irq)
static struct irq_chip opsput_lanpld_irq_type =
{
- .typename = "OPSPUT-PLD-LAN-IRQ",
+ .name = "OPSPUT-PLD-LAN-IRQ",
.startup = startup_opsput_lanpld_irq,
.shutdown = shutdown_opsput_lanpld_irq,
.enable = enable_opsput_lanpld_irq,
diff --git a/arch/m32r/platforms/usrv/setup.c b/arch/m32r/platforms/usrv/setup.c
index 757302660af8..1beac7a51ed4 100644
--- a/arch/m32r/platforms/usrv/setup.c
+++ b/arch/m32r/platforms/usrv/setup.c
@@ -63,7 +63,7 @@ static void shutdown_mappi_irq(unsigned int irq)
static struct irq_chip mappi_irq_type =
{
- .typename = "M32700-IRQ",
+ .name = "M32700-IRQ",
.startup = startup_mappi_irq,
.shutdown = shutdown_mappi_irq,
.enable = enable_mappi_irq,
@@ -136,7 +136,7 @@ static void shutdown_m32700ut_pld_irq(unsigned int irq)
static struct irq_chip m32700ut_pld_irq_type =
{
- .typename = "USRV-PLD-IRQ",
+ .name = "USRV-PLD-IRQ",
.startup = startup_m32700ut_pld_irq,
.shutdown = shutdown_m32700ut_pld_irq,
.enable = enable_m32700ut_pld_irq,
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index 2340f11dc29c..9a526ba6f257 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -103,7 +103,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
goto out_unlock;
- retval = security_task_setscheduler(p, 0, NULL);
+ retval = security_task_setscheduler(p)
if (retval)
goto out_unlock;
diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
index 6c294acac848..9c3d160670b4 100644
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -542,10 +542,6 @@ extern void reloc_got2(unsigned long);
#define PTRRELOC(x) ((typeof(x)) add_reloc_offset((unsigned long)(x)))
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void account_system_vtime(struct task_struct *);
-#endif
-
extern struct dentry *powerpc_debugfs_root;
#endif /* __KERNEL__ */
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index 97085530aa63..e3e379c6caa7 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -310,9 +310,9 @@ static void axon_msi_teardown_msi_irqs(struct pci_dev *dev)
}
static struct irq_chip msic_irq_chip = {
- .mask = mask_msi_irq,
- .unmask = unmask_msi_irq,
- .shutdown = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
+ .irq_unmask = unmask_msi_irq,
+ .irq_shutdown = mask_msi_irq,
.name = "AXON-MSI",
};
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 93834b0d8272..67e2c4bdac8f 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -243,7 +243,7 @@ static unsigned int xics_startup(unsigned int virq)
* at that level, so we do it here by hand.
*/
if (irq_to_desc(virq)->msi_desc)
- unmask_msi_irq(virq);
+ unmask_msi_irq(irq_get_irq_data(virq));
/* unmask it */
xics_unmask_irq(virq);
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 87991d3abbab..bdbd896c89d8 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -51,8 +51,8 @@ static void fsl_msi_end_irq(unsigned int virq)
}
static struct irq_chip fsl_msi_chip = {
- .mask = mask_msi_irq,
- .unmask = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
+ .irq_unmask = unmask_msi_irq,
.ack = fsl_msi_end_irq,
.name = "FSL-MSI",
};
diff --git a/arch/powerpc/sysdev/mpic_pasemi_msi.c b/arch/powerpc/sysdev/mpic_pasemi_msi.c
index 3b6a9a43718f..320ad5a9a25d 100644
--- a/arch/powerpc/sysdev/mpic_pasemi_msi.c
+++ b/arch/powerpc/sysdev/mpic_pasemi_msi.c
@@ -39,24 +39,24 @@
static struct mpic *msi_mpic;
-static void mpic_pasemi_msi_mask_irq(unsigned int irq)
+static void mpic_pasemi_msi_mask_irq(struct irq_data *data)
{
- pr_debug("mpic_pasemi_msi_mask_irq %d\n", irq);
- mask_msi_irq(irq);
- mpic_mask_irq(irq);
+ pr_debug("mpic_pasemi_msi_mask_irq %d\n", data->irq);
+ mask_msi_irq(data);
+ mpic_mask_irq(data->irq);
}
-static void mpic_pasemi_msi_unmask_irq(unsigned int irq)
+static void mpic_pasemi_msi_unmask_irq(struct irq_data *data)
{
- pr_debug("mpic_pasemi_msi_unmask_irq %d\n", irq);
- mpic_unmask_irq(irq);
- unmask_msi_irq(irq);
+ pr_debug("mpic_pasemi_msi_unmask_irq %d\n", data->irq);
+ mpic_unmask_irq(data->irq);
+ unmask_msi_irq(data);
}
static struct irq_chip mpic_pasemi_msi_chip = {
- .shutdown = mpic_pasemi_msi_mask_irq,
- .mask = mpic_pasemi_msi_mask_irq,
- .unmask = mpic_pasemi_msi_unmask_irq,
+ .irq_shutdown = mpic_pasemi_msi_mask_irq,
+ .irq_mask = mpic_pasemi_msi_mask_irq,
+ .irq_unmask = mpic_pasemi_msi_unmask_irq,
.eoi = mpic_end_irq,
.set_type = mpic_set_irq_type,
.set_affinity = mpic_set_affinity,
diff --git a/arch/powerpc/sysdev/mpic_u3msi.c b/arch/powerpc/sysdev/mpic_u3msi.c
index bcbfe79c704b..a2b028b4a202 100644
--- a/arch/powerpc/sysdev/mpic_u3msi.c
+++ b/arch/powerpc/sysdev/mpic_u3msi.c
@@ -23,22 +23,22 @@
/* A bit ugly, can we get this from the pci_dev somehow? */
static struct mpic *msi_mpic;
-static void mpic_u3msi_mask_irq(unsigned int irq)
+static void mpic_u3msi_mask_irq(struct irq_data *data)
{
- mask_msi_irq(irq);
- mpic_mask_irq(irq);
+ mask_msi_irq(data);
+ mpic_mask_irq(data->irq);
}
-static void mpic_u3msi_unmask_irq(unsigned int irq)
+static void mpic_u3msi_unmask_irq(struct irq_data *data)
{
- mpic_unmask_irq(irq);
- unmask_msi_irq(irq);
+ mpic_unmask_irq(data->irq);
+ unmask_msi_irq(data);
}
static struct irq_chip mpic_u3msi_chip = {
- .shutdown = mpic_u3msi_mask_irq,
- .mask = mpic_u3msi_mask_irq,
- .unmask = mpic_u3msi_unmask_irq,
+ .irq_shutdown = mpic_u3msi_mask_irq,
+ .irq_mask = mpic_u3msi_mask_irq,
+ .irq_unmask = mpic_u3msi_unmask_irq,
.eoi = mpic_end_irq,
.set_type = mpic_set_irq_type,
.set_affinity = mpic_set_affinity,
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 958f0dadeadf..75976a141947 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -199,6 +199,13 @@ config HOTPLUG_CPU
can be controlled through /sys/devices/system/cpu/cpu#.
Say N if you want to disable CPU hotplug.
+config SCHED_BOOK
+ bool "Book scheduler support"
+ depends on SMP
+ help
+ Book scheduler support improves the CPU scheduler's decision making
+ when dealing with machines that have several books.
+
config MATHEMU
bool "IEEE FPU emulation"
depends on MARCH_G5
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h
index 498bc3892385..881d94590aeb 100644
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -12,10 +12,6 @@
#ifndef __ASM_HARDIRQ_H
#define __ASM_HARDIRQ_H
-#include
-#include
-#include
-#include
#include
#define local_softirq_pending() (S390_lowcore.softirq_pending)
diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h
index cef66210c846..38ddd8a9a9e8 100644
--- a/arch/s390/include/asm/system.h
+++ b/arch/s390/include/asm/system.h
@@ -97,7 +97,6 @@ static inline void restore_access_regs(unsigned int *acrs)
extern void account_vtime(struct task_struct *, struct task_struct *);
extern void account_tick_vtime(struct task_struct *);
-extern void account_system_vtime(struct task_struct *);
#ifdef CONFIG_PFAULT
extern void pfault_irq_init(void);
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index 831bd033ea77..051107a2c5e2 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -3,15 +3,32 @@
#include
-#define mc_capable() (1)
-
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
-
extern unsigned char cpu_core_id[NR_CPUS];
extern cpumask_t cpu_core_map[NR_CPUS];
+static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
+{
+ return &cpu_core_map[cpu];
+}
+
#define topology_core_id(cpu) (cpu_core_id[cpu])
#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
+#define mc_capable() (1)
+
+#ifdef CONFIG_SCHED_BOOK
+
+extern unsigned char cpu_book_id[NR_CPUS];
+extern cpumask_t cpu_book_map[NR_CPUS];
+
+static inline const struct cpumask *cpu_book_mask(unsigned int cpu)
+{
+ return &cpu_book_map[cpu];
+}
+
+#define topology_book_id(cpu) (cpu_book_id[cpu])
+#define topology_book_cpumask(cpu) (&cpu_book_map[cpu])
+
+#endif /* CONFIG_SCHED_BOOK */
int topology_set_cpu_management(int fc);
void topology_schedule_update(void);
@@ -30,6 +47,8 @@ static inline void s390_init_cpu_topology(void)
};
#endif
+#define SD_BOOK_INIT SD_CPU_INIT
+
#include
#endif /* _ASM_S390_TOPOLOGY_H */
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index bcef00766a64..13559c993847 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -57,8 +57,8 @@ struct tl_info {
union tl_entry tle[0];
};
-struct core_info {
- struct core_info *next;
+struct mask_info {
+ struct mask_info *next;
unsigned char id;
cpumask_t mask;
};
@@ -66,7 +66,6 @@ struct core_info {
static int topology_enabled;
static void topology_work_fn(struct work_struct *work);
static struct tl_info *tl_info;
-static struct core_info core_info;
static int machine_has_topology;
static struct timer_list topology_timer;
static void set_topology_timer(void);
@@ -74,38 +73,37 @@ static DECLARE_WORK(topology_work, topology_work_fn);
/* topology_lock protects the core linked list */
static DEFINE_SPINLOCK(topology_lock);
+static struct mask_info core_info;
cpumask_t cpu_core_map[NR_CPUS];
unsigned char cpu_core_id[NR_CPUS];
-static cpumask_t cpu_coregroup_map(unsigned int cpu)
+#ifdef CONFIG_SCHED_BOOK
+static struct mask_info book_info;
+cpumask_t cpu_book_map[NR_CPUS];
+unsigned char cpu_book_id[NR_CPUS];
+#endif
+
+static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
{
- struct core_info *core = &core_info;
- unsigned long flags;
cpumask_t mask;
cpus_clear(mask);
if (!topology_enabled || !machine_has_topology)
return cpu_possible_map;
- spin_lock_irqsave(&topology_lock, flags);
- while (core) {
- if (cpu_isset(cpu, core->mask)) {
- mask = core->mask;
+ while (info) {
+ if (cpu_isset(cpu, info->mask)) {
+ mask = info->mask;
break;
}
- core = core->next;
+ info = info->next;
}
- spin_unlock_irqrestore(&topology_lock, flags);
if (cpus_empty(mask))
mask = cpumask_of_cpu(cpu);
return mask;
}
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
-{
- return &cpu_core_map[cpu];
-}
-
-static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
+static void add_cpus_to_mask(struct tl_cpu *tl_cpu, struct mask_info *book,
+ struct mask_info *core)
{
unsigned int cpu;
@@ -117,23 +115,35 @@ static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
for_each_present_cpu(lcpu) {
- if (cpu_logical_map(lcpu) == rcpu) {
- cpu_set(lcpu, core->mask);
- cpu_core_id[lcpu] = core->id;
- smp_cpu_polarization[lcpu] = tl_cpu->pp;
- }
+ if (cpu_logical_map(lcpu) != rcpu)
+ continue;
+#ifdef CONFIG_SCHED_BOOK
+ cpu_set(lcpu, book->mask);
+ cpu_book_id[lcpu] = book->id;
+#endif
+ cpu_set(lcpu, core->mask);
+ cpu_core_id[lcpu] = core->id;
+ smp_cpu_polarization[lcpu] = tl_cpu->pp;
}
}
}
-static void clear_cores(void)
+static void clear_masks(void)
{
- struct core_info *core = &core_info;
+ struct mask_info *info;
- while (core) {
- cpus_clear(core->mask);
- core = core->next;
+ info = &core_info;
+ while (info) {
+ cpus_clear(info->mask);
+ info = info->next;
}
+#ifdef CONFIG_SCHED_BOOK
+ info = &book_info;
+ while (info) {
+ cpus_clear(info->mask);
+ info = info->next;
+ }
+#endif
}
static union tl_entry *next_tle(union tl_entry *tle)
@@ -146,29 +156,36 @@ static union tl_entry *next_tle(union tl_entry *tle)
static void tl_to_cores(struct tl_info *info)
{
+#ifdef CONFIG_SCHED_BOOK
+ struct mask_info *book = &book_info;
+#else
+ struct mask_info *book = NULL;
+#endif
+ struct mask_info *core = &core_info;
union tl_entry *tle, *end;
- struct core_info *core = &core_info;
+
spin_lock_irq(&topology_lock);
- clear_cores();
+ clear_masks();
tle = info->tle;
end = (union tl_entry *)((unsigned long)info + info->length);
while (tle < end) {
switch (tle->nl) {
- case 5:
- case 4:
- case 3:
+#ifdef CONFIG_SCHED_BOOK
case 2:
+ book = book->next;
+ book->id = tle->container.id;
break;
+#endif
case 1:
core = core->next;
core->id = tle->container.id;
break;
case 0:
- add_cpus_to_core(&tle->cpu, core);
+ add_cpus_to_mask(&tle->cpu, book, core);
break;
default:
- clear_cores();
+ clear_masks();
machine_has_topology = 0;
goto out;
}
@@ -221,10 +238,29 @@ int topology_set_cpu_management(int fc)
static void update_cpu_core_map(void)
{
+ unsigned long flags;
int cpu;
- for_each_possible_cpu(cpu)
- cpu_core_map[cpu] = cpu_coregroup_map(cpu);
+ spin_lock_irqsave(&topology_lock, flags);
+ for_each_possible_cpu(cpu) {
+ cpu_core_map[cpu] = cpu_group_map(&core_info, cpu);
+#ifdef CONFIG_SCHED_BOOK
+ cpu_book_map[cpu] = cpu_group_map(&book_info, cpu);
+#endif
+ }
+ spin_unlock_irqrestore(&topology_lock, flags);
+}
+
+static void store_topology(struct tl_info *info)
+{
+#ifdef CONFIG_SCHED_BOOK
+ int rc;
+
+ rc = stsi(info, 15, 1, 3);
+ if (rc != -ENOSYS)
+ return;
+#endif
+ stsi(info, 15, 1, 2);
}
int arch_update_cpu_topology(void)
@@ -238,7 +274,7 @@ int arch_update_cpu_topology(void)
topology_update_polarization_simple();
return 0;
}
- stsi(info, 15, 1, 2);
+ store_topology(info);
tl_to_cores(info);
update_cpu_core_map();
for_each_online_cpu(cpu) {
@@ -299,12 +335,24 @@ out:
}
__initcall(init_topology_update);
+static void alloc_masks(struct tl_info *info, struct mask_info *mask, int offset)
+{
+ int i, nr_masks;
+
+ nr_masks = info->mag[NR_MAG - offset];
+ for (i = 0; i < info->mnest - offset; i++)
+ nr_masks *= info->mag[NR_MAG - offset - 1 - i];
+ nr_masks = max(nr_masks, 1);
+ for (i = 0; i < nr_masks; i++) {
+ mask->next = alloc_bootmem(sizeof(struct mask_info));
+ mask = mask->next;
+ }
+}
+
void __init s390_init_cpu_topology(void)
{
unsigned long long facility_bits;
struct tl_info *info;
- struct core_info *core;
- int nr_cores;
int i;
if (stfle(&facility_bits, 1) <= 0)
@@ -315,25 +363,13 @@ void __init s390_init_cpu_topology(void)
tl_info = alloc_bootmem_pages(PAGE_SIZE);
info = tl_info;
- stsi(info, 15, 1, 2);
-
- nr_cores = info->mag[NR_MAG - 2];
- for (i = 0; i < info->mnest - 2; i++)
- nr_cores *= info->mag[NR_MAG - 3 - i];
-
+ store_topology(info);
pr_info("The CPU configuration topology of the machine is:");
for (i = 0; i < NR_MAG; i++)
printk(" %d", info->mag[i]);
printk(" / %d\n", info->mnest);
-
- core = &core_info;
- for (i = 0; i < nr_cores; i++) {
- core->next = alloc_bootmem(sizeof(struct core_info));
- core = core->next;
- if (!core)
- goto error;
- }
- return;
-error:
- machine_has_topology = 0;
+ alloc_masks(info, &core_info, 2);
+#ifdef CONFIG_SCHED_BOOK
+ alloc_masks(info, &book_info, 3);
+#endif
}
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 257de1f0692b..ae5bac39b896 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -290,7 +290,7 @@ void __init init_IRQ(void)
int __init arch_probe_nr_irqs(void)
{
nr_irqs = sh_mv.mv_nr_irqs;
- return 0;
+ return NR_IRQS_LEGACY;
}
#endif
diff --git a/arch/sparc/kernel/pci_msi.c b/arch/sparc/kernel/pci_msi.c
index 548b8ca9c210..b210416ace7b 100644
--- a/arch/sparc/kernel/pci_msi.c
+++ b/arch/sparc/kernel/pci_msi.c
@@ -114,10 +114,10 @@ static void free_msi(struct pci_pbm_info *pbm, int msi_num)
static struct irq_chip msi_irq = {
.name = "PCI-MSI",
- .mask = mask_msi_irq,
- .unmask = unmask_msi_irq,
- .enable = unmask_msi_irq,
- .disable = mask_msi_irq,
+ .irq_mask = mask_msi_irq,
+ .irq_unmask = unmask_msi_irq,
+ .irq_enable = unmask_msi_irq,
+ .irq_disable = mask_msi_irq,
/* XXX affinity XXX */
};
diff --git a/arch/tile/kernel/irq.c b/arch/tile/kernel/irq.c
index 596c60086930..9a27d563fc30 100644
--- a/arch/tile/kernel/irq.c
+++ b/arch/tile/kernel/irq.c
@@ -208,7 +208,7 @@ static void tile_irq_chip_eoi(unsigned int irq)
}
static struct irq_chip tile_irq_chip = {
- .typename = "tile_irq_chip",
+ .name = "tile_irq_chip",
.ack = tile_irq_chip_ack,
.eoi = tile_irq_chip_eoi,
.mask = tile_irq_chip_mask,
@@ -288,7 +288,7 @@ int show_interrupts(struct seq_file *p, void *v)
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
#endif
- seq_printf(p, " %14s", irq_desc[i].chip->typename);
+ seq_printf(p, " %14s", irq_desc[i].chip->name);
seq_printf(p, " %s", action->name);
for (action = action->next; action; action = action->next)
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index a3f0b04d7101..a746e3037a5b 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -46,7 +46,7 @@ int show_interrupts(struct seq_file *p, void *v)
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
#endif
- seq_printf(p, " %14s", irq_desc[i].chip->typename);
+ seq_printf(p, " %14s", irq_desc[i].chip->name);
seq_printf(p, " %s", action->name);
for (action=action->next; action; action = action->next)
@@ -369,7 +369,7 @@ static void dummy(unsigned int irq)
/* This is used for everything else than the timer. */
static struct irq_chip normal_irq_type = {
- .typename = "SIGIO",
+ .name = "SIGIO",
.release = free_irq_by_irq_and_dev,
.disable = dummy,
.enable = dummy,
@@ -378,7 +378,7 @@ static struct irq_chip normal_irq_type = {
};
static struct irq_chip SIGVTALRM_irq_type = {
- .typename = "SIGVTALRM",
+ .name = "SIGVTALRM",
.release = free_irq_by_irq_and_dev,
.shutdown = dummy, /* never called */
.disable = dummy,
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fd227d6b8d9c..7ab9db88ab6a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,10 @@ config X86
select HAVE_USER_RETURN_NOTIFIER
select HAVE_ARCH_JUMP_LABEL
select HAVE_TEXT_POKE_SMP
+ select HAVE_GENERIC_HARDIRQS
+ select HAVE_SPARSE_IRQ
+ select GENERIC_IRQ_PROBE
+ select GENERIC_PENDING_IRQ if SMP
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
@@ -204,20 +208,6 @@ config HAVE_INTEL_TXT
def_bool y
depends on EXPERIMENTAL && DMAR && ACPI
-# Use the generic interrupt handling code in kernel/irq/:
-config GENERIC_HARDIRQS
- def_bool y
-
-config GENERIC_HARDIRQS_NO__DO_IRQ
- def_bool y
-
-config GENERIC_IRQ_PROBE
- def_bool y
-
-config GENERIC_PENDING_IRQ
- def_bool y
- depends on GENERIC_HARDIRQS && SMP
-
config USE_GENERIC_SMP_HELPERS
def_bool y
depends on SMP
@@ -300,23 +290,6 @@ config X86_X2APIC
If you don't know what to do here, say N.
-config SPARSE_IRQ
- bool "Support sparse irq numbering"
- depends on PCI_MSI || HT_IRQ
- ---help---
- This enables support for sparse irqs. This is useful for distro
- kernels that want to define a high CONFIG_NR_CPUS value but still
- want to have low kernel memory footprint on smaller machines.
-
- ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
- out the irq_desc[] array in a more NUMA-friendly way. )
-
- If you don't know what to do here, say N.
-
-config NUMA_IRQ_DESC
- def_bool y
- depends on SPARSE_IRQ && NUMA
-
config X86_MPPARSE
bool "Enable MPS table" if ACPI
default y
@@ -521,25 +494,6 @@ if PARAVIRT_GUEST
source "arch/x86/xen/Kconfig"
-config VMI
- bool "VMI Guest support (DEPRECATED)"
- select PARAVIRT
- depends on X86_32
- ---help---
- VMI provides a paravirtualized interface to the VMware ESX server
- (it could be used by other hypervisors in theory too, but is not
- at the moment), by linking the kernel to a GPL-ed ROM module
- provided by the hypervisor.
-
- As of September 2009, VMware has started a phased retirement
- of this feature from VMware's products. Please see
- feature-removal-schedule.txt for details. If you are
- planning to enable this option, please note that you cannot
- live migrate a VMI enabled VM to a future VMware product,
- which doesn't support VMI. So if you expect your kernel to
- seamlessly migrate to newer VMware products, keep this
- disabled.
-
config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
@@ -674,7 +628,7 @@ config GART_IOMMU
bool "GART IOMMU support" if EMBEDDED
default y
select SWIOTLB
- depends on X86_64 && PCI && K8_NB
+ depends on X86_64 && PCI && AMD_NB
---help---
Support for full DMA access of devices with 32bit memory access only
on systems with more than 3GB. This is usually needed for USB,
@@ -799,6 +753,17 @@ config SCHED_MC
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
+config IRQ_TIME_ACCOUNTING
+ bool "Fine granularity task level IRQ time accounting"
+ default n
+ ---help---
+ Select this option to enable fine granularity task irq time
+ accounting. This is done by reading a timestamp on each
+ transitions between softirq and hardirq state, so there can be a
+ small performance impact.
+
+ If in doubt, say N here.
+
source "kernel/Kconfig.preempt"
config X86_UP_APIC
@@ -1152,6 +1117,9 @@ config X86_PAE
config ARCH_PHYS_ADDR_T_64BIT
def_bool X86_64 || X86_PAE
+config ARCH_DMA_ADDR_T_64BIT
+ def_bool X86_64 || HIGHMEM64G
+
config DIRECT_GBPAGES
bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
default y
@@ -1330,25 +1298,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
Set whether the default state of memory_corruption_check is
on or off.
-config X86_RESERVE_LOW_64K
- bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
- default y
+config X86_RESERVE_LOW
+ int "Amount of low memory, in kilobytes, to reserve for the BIOS"
+ default 64
+ range 4 640
---help---
- Reserve the first 64K of physical RAM on BIOSes that are known
- to potentially corrupt that memory range. A numbers of BIOSes are
- known to utilize this area during suspend/resume, so it must not
- be used by the kernel.
+ Specify the amount of low memory to reserve for the BIOS.
- Set this to N if you are absolutely sure that you trust the BIOS
- to get all its memory reservations and usages right.
+ The first page contains BIOS data structures that the kernel
+ must not use, so that page must always be reserved.
- If you have doubts about the BIOS (e.g. suspend/resume does not
- work or there's kernel crashes after certain hardware hotplug
- events) and it's not AMI or Phoenix, then you might want to enable
- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
- corruption patterns.
+ By default we reserve the first 64K of physical RAM, as a
+ number of BIOSes are known to corrupt that memory range
+ during events such as suspend/resume or monitor cable
+ insertion, so it must not be used by the kernel.
- Say Y if unsure.
+ You can set this to 4 if you are absolutely sure that you
+ trust the BIOS to get all its memory reservations and usages
+ right. If you know your BIOS have problems beyond the
+ default 64K area, you can set this to 640 to avoid using the
+ entire low memory range.
+
+ If you have doubts about the BIOS (e.g. suspend/resume does
+ not work or there's kernel crashes after certain hardware
+ hotplug events) then you might want to enable
+ X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
+ typical corruption patterns.
+
+ Leave this to the default value of 64 if you are unsure.
config MATH_EMULATION
bool
@@ -1904,7 +1881,7 @@ config PCI_GODIRECT
bool "Direct"
config PCI_GOOLPC
- bool "OLPC"
+ bool "OLPC XO-1"
depends on OLPC
config PCI_GOANY
@@ -2065,14 +2042,21 @@ config SCx200HR_TIMER
config OLPC
bool "One Laptop Per Child support"
select GPIOLIB
+ select OLPC_OPENFIRMWARE
---help---
Add support for detecting the unique features of the OLPC
XO hardware.
+config OLPC_XO1
+ tristate "OLPC XO-1 support"
+ depends on OLPC && PCI
+ ---help---
+ Add support for non-essential features of the OLPC XO-1 laptop.
+
config OLPC_OPENFIRMWARE
bool "Support for OLPC's Open Firmware"
depends on !X86_64 && !X86_PAE
- default y if OLPC
+ default n
help
This option adds support for the implementation of Open Firmware
that is used on the OLPC XO-1 Children's Machine.
@@ -2080,7 +2064,7 @@ config OLPC_OPENFIRMWARE
endif # X86_32
-config K8_NB
+config AMD_NB
def_bool y
depends on CPU_SUP_AMD && PCI
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 75085080b63e..e5bb96b10f1a 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,10 @@ config EARLY_PRINTK
with klogd/syslogd or the X server. You should normally N here,
unless you want to debug such a crash.
+config EARLY_PRINTK_MRST
+ bool "Early printk for MRST platform support"
+ depends on EARLY_PRINTK && X86_MRST
+
config EARLY_PRINTK_DBGP
bool "Early printk via EHCI debug port"
depends on EARLY_PRINTK && PCI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index e8c8881351b3..b02e509072a7 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en
# is .cfi_signal_frame supported too?
cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
+
+# does binutils support specific instructions?
+asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
LDFLAGS := -m elf_$(UTS_MACHINE)
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 5af2982133b5..f16a2caca1e0 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel
* Leo Duran
*
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index cb030374b90a..916bc8111a01 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel
*
* This program is free software; you can redistribute it and/or modify it
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 08616180deaf..e3509fc303bf 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel
* Leo Duran
*
@@ -416,13 +416,22 @@ struct amd_iommu {
struct dma_ops_domain *default_dom;
/*
- * This array is required to work around a potential BIOS bug.
- * The BIOS may miss to restore parts of the PCI configuration
- * space when the system resumes from S3. The result is that the
- * IOMMU does not execute commands anymore which leads to system
- * failure.
+ * We can't rely on the BIOS to restore all values on reinit, so we
+ * need to stash them
*/
- u32 cache_cfg[4];
+
+ /* The iommu BAR */
+ u32 stored_addr_lo;
+ u32 stored_addr_hi;
+
+ /*
+ * Each iommu has 6 l1s, each of which is documented as having 0x12
+ * registers
+ */
+ u32 stored_l1[6][0x12];
+
+ /* The l2 indirect registers */
+ u32 stored_l2[0x83];
};
/*
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/amd_nb.h
similarity index 61%
rename from arch/x86/include/asm/k8.h
rename to arch/x86/include/asm/amd_nb.h
index af00bd1d2089..c8517f81b21e 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_K8_H
-#define _ASM_X86_K8_H
+#ifndef _ASM_X86_AMD_NB_H
+#define _ASM_X86_AMD_NB_H
#include
@@ -7,24 +7,27 @@ extern struct pci_device_id k8_nb_ids[];
struct bootnode;
extern int early_is_k8_nb(u32 value);
-extern struct pci_dev **k8_northbridges;
-extern int num_k8_northbridges;
extern int cache_k8_northbridges(void);
extern void k8_flush_garts(void);
extern int k8_get_nodes(struct bootnode *nodes);
extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
extern int k8_scan_nodes(void);
-#ifdef CONFIG_K8_NB
-extern int num_k8_northbridges;
+struct k8_northbridge_info {
+ u16 num;
+ u8 gart_supported;
+ struct pci_dev **nb_misc;
+};
+extern struct k8_northbridge_info k8_northbridges;
+
+#ifdef CONFIG_AMD_NB
static inline struct pci_dev *node_to_k8_nb_misc(int node)
{
- return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
+ return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
}
#else
-#define num_k8_northbridges 0
static inline struct pci_dev *node_to_k8_nb_misc(int node)
{
@@ -33,4 +36,4 @@ static inline struct pci_dev *node_to_k8_nb_misc(int node)
#endif
-#endif /* _ASM_X86_K8_H */
+#endif /* _ASM_X86_AMD_NB_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index a69b1ac9eaf8..2fefa501d3ba 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -54,7 +54,6 @@ extern struct clock_event_device *global_clock_event;
extern unsigned long apbt_quick_calibrate(void);
extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
extern void apbt_setup_secondary_clock(void);
-extern unsigned int boot_cpu_id;
extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1fa03e04ae44..286de34b0ed6 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -252,9 +252,7 @@ static inline int apic_is_clustered_box(void)
}
#endif
-extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
-extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
-
+extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
#else /* !CONFIG_X86_LOCAL_APIC */
static inline void lapic_shutdown(void) { }
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7fe3b3060f08..a859ca461fb0 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -131,6 +131,7 @@
#define APIC_EILVTn(n) (0x500 + 0x10 * n)
#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
#define APIC_EILVT_NR_AMD_10H 4
+#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H
#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
#define APIC_EILVT_MSG_FIX 0x0
#define APIC_EILVT_MSG_SMI 0x2
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b185091bf19c..4fab24de26b1 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,6 +32,5 @@ extern void arch_unregister_cpu(int);
DECLARE_PER_CPU(int, cpu_state);
-extern unsigned int boot_cpu_id;
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3f76523589af..220e2ea08e80 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -152,10 +152,14 @@
#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
+#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */
#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */
#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
/*
* Auxiliary flags: Linux defined - For features scattered in various
@@ -180,6 +184,13 @@
#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
+
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 733f7e91e7a9..326099199318 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -89,6 +89,16 @@
CFI_ADJUST_CFA_OFFSET -8
.endm
+ .macro pushfq_cfi
+ pushfq
+ CFI_ADJUST_CFA_OFFSET 8
+ .endm
+
+ .macro popfq_cfi
+ popfq
+ CFI_ADJUST_CFA_OFFSET -8
+ .endm
+
.macro movq_cfi reg offset=0
movq %\reg, \offset(%rsp)
CFI_REL_OFFSET \reg, \offset
@@ -109,6 +119,16 @@
CFI_ADJUST_CFA_OFFSET -4
.endm
+ .macro pushfl_cfi
+ pushfl
+ CFI_ADJUST_CFA_OFFSET 4
+ .endm
+
+ .macro popfl_cfi
+ popfl
+ CFI_ADJUST_CFA_OFFSET -4
+ .endm
+
.macro movl_cfi reg offset=0
movl %\reg, \offset(%esp)
CFI_REL_OFFSET \reg, \offset
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index d07b44f7d1dc..4d293dced62f 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -214,5 +214,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
return __virt_to_fix(vaddr);
}
+
+/* Return an pointer with offset calculated */
+static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
+{
+ __set_fixmap(idx, phys, flags);
+ return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
+}
+
+#define set_fixmap_offset(idx, phys) \
+ __set_fixmap_offset(idx, phys, PAGE_KERNEL)
+
+#define set_fixmap_offset_nocache(idx, phys) \
+ __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 4ac5b0f33fc1..bf357f9b25f0 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -17,6 +17,7 @@ extern int fix_aperture;
#define GARTEN (1<<0)
#define DISGARTCPU (1<<4)
#define DISGARTIO (1<<5)
+#define DISTLBWALKPRB (1<<6)
/* GART cache control register bits. */
#define INVGART (1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
#define AMD64_GARTAPERTUREBASE 0x94
#define AMD64_GARTTABLEBASE 0x98
#define AMD64_GARTCACHECTL 0x9c
-#define AMD64_GARTEN (1<<0)
#ifdef CONFIG_GART_IOMMU
extern int gart_iommu_aperture;
@@ -57,6 +57,19 @@ static inline void gart_iommu_hole_init(void)
extern int agp_amd64_init(void);
+static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
+{
+ u32 ctl;
+
+ /*
+ * Don't enable translation but enable GART IO and CPU accesses.
+ * Also, set DISTLBWALKPRB since GART tables memory is UC.
+ */
+ ctl = DISTLBWALKPRB | order << 1;
+
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
{
u32 tmp, ctl;
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1d5c08a1bdfd..2c392d663dce 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -74,10 +74,12 @@ extern void hpet_disable(void);
extern unsigned int hpet_readl(unsigned int a);
extern void force_hpet_resume(void);
-extern void hpet_msi_unmask(unsigned int irq);
-extern void hpet_msi_mask(unsigned int irq);
-extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
-extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
+struct irq_data;
+extern void hpet_msi_unmask(struct irq_data *data);
+extern void hpet_msi_mask(struct irq_data *data);
+struct hpet_dev;
+extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
+extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
#ifdef CONFIG_PCI_MSI
extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 3a54a1ca1a02..0274ec5a7e62 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -78,6 +78,13 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
irq_attr->polarity = polarity;
}
+struct irq_2_iommu {
+ struct intel_iommu *iommu;
+ u16 irte_index;
+ u16 sub_handle;
+ u8 irte_mask;
+};
+
/*
* This is performance-critical, we want to do it O(1)
*
@@ -89,15 +96,17 @@ struct irq_cfg {
cpumask_var_t old_domain;
u8 vector;
u8 move_in_progress : 1;
+#ifdef CONFIG_INTR_REMAP
+ struct irq_2_iommu irq_2_iommu;
+#endif
};
-extern struct irq_cfg *irq_cfg(unsigned int);
extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
extern void send_cleanup_vector(struct irq_cfg *);
-struct irq_desc;
-extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *,
- unsigned int *dest_id);
+struct irq_data;
+int __ioapic_set_affinity(struct irq_data *, const struct cpumask *,
+ unsigned int *dest_id);
extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
extern void setup_ioapic_dest(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a73a8d5a5e69..4aa2bb3b242a 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf);
extern int restore_i387_xstate_ia32(void __user *buf);
#endif
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_soft_fpu(struct i387_soft_struct *soft);
+#else
+static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
+#endif
+
#define X87_FSW_ES (1 << 7) /* Exception Summary */
static __always_inline __pure bool use_xsaveopt(void)
@@ -67,6 +73,11 @@ static __always_inline __pure bool use_xsave(void)
return static_cpu_has(X86_FEATURE_XSAVE);
}
+static __always_inline __pure bool use_fxsr(void)
+{
+ return static_cpu_has(X86_FEATURE_FXSR);
+}
+
extern void __sanitize_i387_state(struct task_struct *);
static inline void sanitize_i387_state(struct task_struct *tsk)
@@ -77,19 +88,11 @@ static inline void sanitize_i387_state(struct task_struct *tsk)
}
#ifdef CONFIG_X86_64
-
-/* Ignore delayed exceptions from user space */
-static inline void tolerant_fwait(void)
-{
- asm volatile("1: fwait\n"
- "2:\n"
- _ASM_EXTABLE(1b, 2b));
-}
-
static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
int err;
+ /* See comment in fxsave() below. */
asm volatile("1: rex64/fxrstor (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -98,44 +101,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err)
-#if 0 /* See comment in fxsave() below. */
- : [fx] "r" (fx), "m" (*fx), "0" (0));
-#else
- : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
-#endif
+ : [fx] "R" (fx), "m" (*fx), "0" (0));
return err;
}
-/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
- is pending. Clear the x87 state here by setting it to fixed
- values. The kernel data segment can be sometimes 0 and sometimes
- new user value. Both should be ok.
- Use the PDA as safe address because it should be already in L1. */
-static inline void fpu_clear(struct fpu *fpu)
-{
- struct xsave_struct *xstate = &fpu->state->xsave;
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
- /*
- * xsave header may indicate the init state of the FP.
- */
- if (use_xsave() &&
- !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
- return;
-
- if (unlikely(fx->swd & X87_FSW_ES))
- asm volatile("fnclex");
- alternative_input(ASM_NOP8 ASM_NOP2,
- " emms\n" /* clear stack tags */
- " fildl %%gs:0", /* load to clear state */
- X86_FEATURE_FXSAVE_LEAK);
-}
-
-static inline void clear_fpu_state(struct task_struct *tsk)
-{
- fpu_clear(&tsk->thread.fpu);
-}
-
static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
{
int err;
@@ -149,6 +118,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
if (unlikely(err))
return -EFAULT;
+ /* See comment in fxsave() below. */
asm volatile("1: rex64/fxsave (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -157,11 +127,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in fxsave() below. */
- : [fx] "r" (fx), "0" (0));
-#else
- : [fx] "cdaSDb" (fx), "0" (0));
-#endif
+ : [fx] "R" (fx), "0" (0));
if (unlikely(err) &&
__clear_user(fx, sizeof(struct i387_fxsave_struct)))
err = -EFAULT;
@@ -175,56 +141,29 @@ static inline void fpu_fxsave(struct fpu *fpu)
uses any extended registers for addressing, a second REX prefix
will be generated (to the assembler, rex64 followed by semicolon
is a separate instruction), and hence the 64-bitness is lost. */
-#if 0
+
+#ifdef CONFIG_AS_FXSAVEQ
/* Using "fxsaveq %0" would be the ideal choice, but is only supported
starting with gas 2.16. */
__asm__ __volatile__("fxsaveq %0"
: "=m" (fpu->state->fxsave));
-#elif 0
+#else
/* Using, as a workaround, the properly prefixed form below isn't
accepted by any binutils version so far released, complaining that
the same type of prefix is used twice if an extended register is
- needed for addressing (fix submitted to mainline 2005-11-21). */
- __asm__ __volatile__("rex64/fxsave %0"
- : "=m" (fpu->state->fxsave));
-#else
- /* This, however, we can work around by forcing the compiler to select
+ needed for addressing (fix submitted to mainline 2005-11-21).
+ asm volatile("rex64/fxsave %0"
+ : "=m" (fpu->state->fxsave));
+ This, however, we can work around by forcing the compiler to select
an addressing mode that doesn't require extended registers. */
- __asm__ __volatile__("rex64/fxsave (%1)"
- : "=m" (fpu->state->fxsave)
- : "cdaSDb" (&fpu->state->fxsave));
+ asm volatile("rex64/fxsave (%[fx])"
+ : "=m" (fpu->state->fxsave)
+ : [fx] "R" (&fpu->state->fxsave));
#endif
}
-static inline void fpu_save_init(struct fpu *fpu)
-{
- if (use_xsave())
- fpu_xsave(fpu);
- else
- fpu_fxsave(fpu);
-
- fpu_clear(fpu);
-}
-
-static inline void __save_init_fpu(struct task_struct *tsk)
-{
- fpu_save_init(&tsk->thread.fpu);
- task_thread_info(tsk)->status &= ~TS_USEDFPU;
-}
-
#else /* CONFIG_X86_32 */
-#ifdef CONFIG_MATH_EMULATION
-extern void finit_soft_fpu(struct i387_soft_struct *soft);
-#else
-static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
-#endif
-
-static inline void tolerant_fwait(void)
-{
- asm volatile("fnclex ; fwait");
-}
-
/* perform fxrstor iff the processor has extended states, otherwise frstor */
static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
@@ -241,6 +180,14 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
return 0;
}
+static inline void fpu_fxsave(struct fpu *fpu)
+{
+ asm volatile("fxsave %[fx]"
+ : [fx] "=m" (fpu->state->fxsave));
+}
+
+#endif /* CONFIG_X86_64 */
+
/* We need a safe address that is cheap to find and that is already
in L1 during context switch. The best choices are unfortunately
different for UP and SMP */
@@ -256,47 +203,33 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
static inline void fpu_save_init(struct fpu *fpu)
{
if (use_xsave()) {
- struct xsave_struct *xstate = &fpu->state->xsave;
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
fpu_xsave(fpu);
/*
* xsave header may indicate the init state of the FP.
*/
- if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
- goto end;
-
- if (unlikely(fx->swd & X87_FSW_ES))
- asm volatile("fnclex");
-
- /*
- * we can do a simple return here or be paranoid :)
- */
- goto clear_state;
+ if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
+ return;
+ } else if (use_fxsr()) {
+ fpu_fxsave(fpu);
+ } else {
+ asm volatile("fsave %[fx]; fwait"
+ : [fx] "=m" (fpu->state->fsave));
+ return;
}
- /* Use more nops than strictly needed in case the compiler
- varies code */
- alternative_input(
- "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
- "fxsave %[fx]\n"
- "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
- X86_FEATURE_FXSR,
- [fx] "m" (fpu->state->fxsave),
- [fsw] "m" (fpu->state->fxsave.swd) : "memory");
-clear_state:
+ if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
+ asm volatile("fnclex");
+
/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. safe_address is a random variable that should be in L1 */
alternative_input(
- GENERIC_NOP8 GENERIC_NOP2,
+ ASM_NOP8 ASM_NOP2,
"emms\n\t" /* clear stack tags */
- "fildl %[addr]", /* set F?P to defined value */
+ "fildl %P[addr]", /* set F?P to defined value */
X86_FEATURE_FXSAVE_LEAK,
[addr] "m" (safe_address));
-end:
- ;
}
static inline void __save_init_fpu(struct task_struct *tsk)
@@ -305,9 +238,6 @@ static inline void __save_init_fpu(struct task_struct *tsk)
task_thread_info(tsk)->status &= ~TS_USEDFPU;
}
-
-#endif /* CONFIG_X86_64 */
-
static inline int fpu_fxrstor_checking(struct fpu *fpu)
{
return fxrstor_checking(&fpu->state->fxsave);
@@ -344,7 +274,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk)
static inline void __clear_fpu(struct task_struct *tsk)
{
if (task_thread_info(tsk)->status & TS_USEDFPU) {
- tolerant_fwait();
+ /* Ignore delayed exceptions from user space */
+ asm volatile("1: fwait\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b));
task_thread_info(tsk)->status &= ~TS_USEDFPU;
stts();
}
@@ -405,19 +338,6 @@ static inline void irq_ts_restore(int TS_state)
stts();
}
-#ifdef CONFIG_X86_64
-
-static inline void save_init_fpu(struct task_struct *tsk)
-{
- __save_init_fpu(tsk);
- stts();
-}
-
-#define unlazy_fpu __unlazy_fpu
-#define clear_fpu __clear_fpu
-
-#else /* CONFIG_X86_32 */
-
/*
* These disable preemption on their own and are safe
*/
@@ -443,8 +363,6 @@ static inline void clear_fpu(struct task_struct *tsk)
preempt_enable();
}
-#endif /* CONFIG_X86_64 */
-
/*
* i387 state interaction
*/
@@ -508,7 +426,4 @@ extern void fpu_finit(struct fpu *fpu);
#endif /* __ASSEMBLY__ */
-#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
-#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
-
#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1655147646aa..a20365953bf8 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -55,6 +55,8 @@ extern struct irq_chip i8259A_chip;
struct legacy_pic {
int nr_legacy_irqs;
struct irq_chip *chip;
+ void (*mask)(unsigned int irq);
+ void (*unmask)(unsigned int irq);
void (*mask_all)(void);
void (*restore_mask)(void);
void (*init)(int auto_eoi);
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 30a3e9776123..6a45ec41ec26 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -206,6 +206,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
extern void iounmap(volatile void __iomem *addr);
+extern void set_iounmap_nonlazy(void);
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9cb2edb87c2f..c8be4566c3d2 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -170,12 +170,6 @@ extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern void probe_nr_irqs_gsi(void);
-extern int setup_ioapic_entry(int apic, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector, int pin);
-extern void ioapic_write_entry(int apic, int pin,
- struct IO_APIC_route_entry e);
extern void setup_ioapic_ids_from_mpc(void);
struct mp_ioapic_gsi{
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f275e2244505..1c23360fb2d8 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -3,4 +3,39 @@
#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
+#ifdef CONFIG_INTR_REMAP
+static inline void prepare_irte(struct irte *irte, int vector,
+ unsigned int dest)
+{
+ memset(irte, 0, sizeof(*irte));
+
+ irte->present = 1;
+ irte->dst_mode = apic->irq_dest_mode;
+ /*
+ * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
+ * actual level or edge trigger will be setup in the IO-APIC
+ * RTE. This will help simplify level triggered irq migration.
+ * For more details, see the comments (in io_apic.c) explainig IO-APIC
+ * irq migration in the presence of interrupt-remapping.
+ */
+ irte->trigger_mode = 0;
+ irte->dlvry_mode = apic->irq_delivery_mode;
+ irte->vector = vector;
+ irte->dest_id = IRTE_DEST(dest);
+ irte->redir_hint = 1;
+}
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return cfg->irq_2_iommu.iommu != NULL;
+}
+#else
+static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
+{
+}
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return false;
+}
+#endif
+
#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 16350740edf6..4a711a684b17 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -10,6 +10,9 @@
*/
#ifndef _ASM_X86_MRST_H
#define _ASM_X86_MRST_H
+
+#include
+
extern int pci_mrst_init(void);
int __init sfi_parse_mrtc(struct sfi_table_header *table);
@@ -26,7 +29,7 @@ enum mrst_cpu_type {
};
extern enum mrst_cpu_type __mrst_cpu_chip;
-static enum mrst_cpu_type mrst_identify_cpu(void)
+static inline enum mrst_cpu_type mrst_identify_cpu(void)
{
return __mrst_cpu_chip;
}
@@ -42,4 +45,9 @@ extern enum mrst_timer_options mrst_timer_options;
#define SFI_MTMR_MAX_NUM 8
#define SFI_MRTC_MAX 8
+extern struct console early_mrst_console;
+extern void mrst_early_console_init(void);
+
+extern struct console early_hsu_console;
+extern void hsu_early_console_init(void);
#endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
new file mode 100644
index 000000000000..bcdff997668c
--- /dev/null
+++ b/arch/x86/include/asm/mwait.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_X86_MWAIT_H
+#define _ASM_X86_MWAIT_H
+
+#define MWAIT_SUBSTATE_MASK 0xf
+#define MWAIT_CSTATE_MASK 0xf
+#define MWAIT_SUBSTATE_SIZE 4
+#define MWAIT_MAX_NUM_CSTATES 8
+
+#define CPUID_MWAIT_LEAF 5
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
+#define CPUID5_ECX_INTERRUPT_BREAK 0x2
+
+#define MWAIT_ECX_INTERRUPT_BREAK 0x1
+
+#endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 08fde475cb3b..2a8478140bb3 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -21,10 +21,14 @@ extern void olpc_ofw_detect(void);
/* install OFW's pde permanently into the kernel's pgtable */
extern void setup_olpc_ofw_pgd(void);
+/* check if OFW was detected during boot */
+extern bool olpc_ofw_present(void);
+
#else /* !CONFIG_OLPC_OPENFIRMWARE */
static inline void olpc_ofw_detect(void) { }
static inline void setup_olpc_ofw_pgd(void) { }
+static inline bool olpc_ofw_present(void) { return false; }
#endif /* !CONFIG_OLPC_OPENFIRMWARE */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a667f24c7254..1df66211fd1b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -8,7 +8,7 @@
#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
-#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
/* Cast PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5653f43d90e5..edecb4ed2210 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
}
-static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
- unsigned long start, unsigned long count)
-{
- PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
-}
static inline void paravirt_release_pmd(unsigned long pfn)
{
PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index db9ef5532341..b82bac975250 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -255,7 +255,6 @@ struct pv_mmu_ops {
*/
void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
- void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
void (*release_pte)(unsigned long pfn);
void (*release_pmd)(unsigned long pfn);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a34c785c5a63..ada823a13c7c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
#ifdef CONFIG_PARAVIRT
#include
#else /* !CONFIG_PARAVIRT */
@@ -603,6 +605,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
pte_update(mm, addr, ptep);
}
+#define flush_tlb_fix_spurious_fault(vma, address)
+
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 076052cd62be..f96ac9bedf75 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
native_set_pgd(pgd, native_make_pgd(0));
}
+extern void sync_global_pgds(unsigned long start, unsigned long end);
+
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 325b7bdbebaa..cae9c3cb95cf 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -110,6 +110,8 @@ struct cpuinfo_x86 {
u16 phys_proc_id;
/* Core id: */
u16 cpu_core_id;
+ /* Compute unit id */
+ u8 compute_unit_id;
/* Index into per_cpu list: */
u16 cpu_index;
#endif
@@ -602,7 +604,7 @@ extern unsigned long mmu_cr4_features;
static inline void set_in_cr4(unsigned long mask)
{
- unsigned cr4;
+ unsigned long cr4;
mmu_cr4_features |= mask;
cr4 = read_cr4();
@@ -612,7 +614,7 @@ static inline void set_in_cr4(unsigned long mask)
static inline void clear_in_cr4(unsigned long mask)
{
- unsigned cr4;
+ unsigned long cr4;
mmu_cr4_features &= ~mask;
cr4 = read_cr4();
@@ -764,29 +766,6 @@ extern unsigned long idle_halt;
extern unsigned long idle_nomwait;
extern bool c1e_detected;
-/*
- * on systems with caches, caches must be flashed as the absolute
- * last instruction before going into a suspended halt. Otherwise,
- * dirty data can linger in the cache and become stale on resume,
- * leading to strange errors.
- *
- * perform a variety of operations to guarantee that the compiler
- * will not reorder instructions. wbinvd itself is serializing
- * so the processor will not reorder.
- *
- * Systems without cache can just go into halt.
- */
-static inline void wbinvd_halt(void)
-{
- mb();
- /* check for clflush to determine if wbinvd is legal */
- if (cpu_has_clflush)
- asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
- else
- while (1)
- halt();
-}
-
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ef292c792d74..d6763b139a84 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align);
: : "i" (sz)); \
}
+/* Helper for reserving space for arrays of things */
+#define RESERVE_BRK_ARRAY(type, name, entries) \
+ type *name; \
+ RESERVE_BRK(name, sizeof(type) * entries)
+
#ifdef __i386__
void __init i386_start_kernel(void);
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
deleted file mode 100644
index 61e08c0a2907..000000000000
--- a/arch/x86/include/asm/vmi.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * VMI interface definition
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Maintained by: Zachary Amsden zach@vmware.com
- *
- */
-#include
-
-/*
- *---------------------------------------------------------------------
- *
- * VMI Option ROM API
- *
- *---------------------------------------------------------------------
- */
-#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
-
-#define PCI_VENDOR_ID_VMWARE 0x15AD
-#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
-
-/*
- * We use two version numbers for compatibility, with the major
- * number signifying interface breakages, and the minor number
- * interface extensions.
- */
-#define VMI_API_REV_MAJOR 3
-#define VMI_API_REV_MINOR 0
-
-#define VMI_CALL_CPUID 0
-#define VMI_CALL_WRMSR 1
-#define VMI_CALL_RDMSR 2
-#define VMI_CALL_SetGDT 3
-#define VMI_CALL_SetLDT 4
-#define VMI_CALL_SetIDT 5
-#define VMI_CALL_SetTR 6
-#define VMI_CALL_GetGDT 7
-#define VMI_CALL_GetLDT 8
-#define VMI_CALL_GetIDT 9
-#define VMI_CALL_GetTR 10
-#define VMI_CALL_WriteGDTEntry 11
-#define VMI_CALL_WriteLDTEntry 12
-#define VMI_CALL_WriteIDTEntry 13
-#define VMI_CALL_UpdateKernelStack 14
-#define VMI_CALL_SetCR0 15
-#define VMI_CALL_SetCR2 16
-#define VMI_CALL_SetCR3 17
-#define VMI_CALL_SetCR4 18
-#define VMI_CALL_GetCR0 19
-#define VMI_CALL_GetCR2 20
-#define VMI_CALL_GetCR3 21
-#define VMI_CALL_GetCR4 22
-#define VMI_CALL_WBINVD 23
-#define VMI_CALL_SetDR 24
-#define VMI_CALL_GetDR 25
-#define VMI_CALL_RDPMC 26
-#define VMI_CALL_RDTSC 27
-#define VMI_CALL_CLTS 28
-#define VMI_CALL_EnableInterrupts 29
-#define VMI_CALL_DisableInterrupts 30
-#define VMI_CALL_GetInterruptMask 31
-#define VMI_CALL_SetInterruptMask 32
-#define VMI_CALL_IRET 33
-#define VMI_CALL_SYSEXIT 34
-#define VMI_CALL_Halt 35
-#define VMI_CALL_Reboot 36
-#define VMI_CALL_Shutdown 37
-#define VMI_CALL_SetPxE 38
-#define VMI_CALL_SetPxELong 39
-#define VMI_CALL_UpdatePxE 40
-#define VMI_CALL_UpdatePxELong 41
-#define VMI_CALL_MachineToPhysical 42
-#define VMI_CALL_PhysicalToMachine 43
-#define VMI_CALL_AllocatePage 44
-#define VMI_CALL_ReleasePage 45
-#define VMI_CALL_InvalPage 46
-#define VMI_CALL_FlushTLB 47
-#define VMI_CALL_SetLinearMapping 48
-
-#define VMI_CALL_SetIOPLMask 61
-#define VMI_CALL_SetInitialAPState 62
-#define VMI_CALL_APICWrite 63
-#define VMI_CALL_APICRead 64
-#define VMI_CALL_IODelay 65
-#define VMI_CALL_SetLazyMode 73
-
-/*
- *---------------------------------------------------------------------
- *
- * MMU operation flags
- *
- *---------------------------------------------------------------------
- */
-
-/* Flags used by VMI_{Allocate|Release}Page call */
-#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
-#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
-#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
-
-
-/* Flags shared by Allocate|Release Page and PTE updates */
-#define VMI_PAGE_PT 0x01
-#define VMI_PAGE_PD 0x02
-#define VMI_PAGE_PDP 0x04
-#define VMI_PAGE_PML4 0x08
-
-#define VMI_PAGE_NORMAL 0x00 /* for debugging */
-
-/* Flags used by PTE updates */
-#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
-#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
-#define VMI_PAGE_VA_MASK 0xfffff000
-
-#ifdef CONFIG_X86_PAE
-#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#else
-#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
-#endif
-
-/* Flags used by VMI_FlushTLB call */
-#define VMI_FLUSH_TLB 0x01
-#define VMI_FLUSH_GLOBAL 0x02
-
-/*
- *---------------------------------------------------------------------
- *
- * VMI relocation definitions for ROM call get_reloc
- *
- *---------------------------------------------------------------------
- */
-
-/* VMI Relocation types */
-#define VMI_RELOCATION_NONE 0
-#define VMI_RELOCATION_CALL_REL 1
-#define VMI_RELOCATION_JUMP_REL 2
-#define VMI_RELOCATION_NOP 3
-
-#ifndef __ASSEMBLY__
-struct vmi_relocation_info {
- unsigned char *eip;
- unsigned char type;
- unsigned char reserved[3];
-};
-#endif
-
-
-/*
- *---------------------------------------------------------------------
- *
- * Generic ROM structures and definitions
- *
- *---------------------------------------------------------------------
- */
-
-#ifndef __ASSEMBLY__
-
-struct vrom_header {
- u16 rom_signature; /* option ROM signature */
- u8 rom_length; /* ROM length in 512 byte chunks */
- u8 rom_entry[4]; /* 16-bit code entry point */
- u8 rom_pad0; /* 4-byte align pad */
- u32 vrom_signature; /* VROM identification signature */
- u8 api_version_min;/* Minor version of API */
- u8 api_version_maj;/* Major version of API */
- u8 jump_slots; /* Number of jump slots */
- u8 reserved1; /* Reserved for expansion */
- u32 virtual_top; /* Hypervisor virtual address start */
- u16 reserved2; /* Reserved for expansion */
- u16 license_offs; /* Offset to License string */
- u16 pci_header_offs;/* Offset to PCI OPROM header */
- u16 pnp_header_offs;/* Offset to PnP OPROM header */
- u32 rom_pad3; /* PnP reserverd / VMI reserved */
- u8 reserved[96]; /* Reserved for headers */
- char vmi_init[8]; /* VMI_Init jump point */
- char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
-} __attribute__((packed));
-
-struct pnp_header {
- char sig[4];
- char rev;
- char size;
- short next;
- short res;
- long devID;
- unsigned short manufacturer_offset;
- unsigned short product_offset;
-} __attribute__((packed));
-
-struct pci_header {
- char sig[4];
- short vendorID;
- short deviceID;
- short vpdData;
- short size;
- char rev;
- char class;
- char subclass;
- char interface;
- short chunks;
- char rom_version_min;
- char rom_version_maj;
- char codetype;
- char lastRom;
- short reserved;
-} __attribute__((packed));
-
-/* Function prototypes for bootstrapping */
-#ifdef CONFIG_VMI
-extern void vmi_init(void);
-extern void vmi_activate(void);
-extern void vmi_bringup(void);
-#else
-static inline void vmi_init(void) {}
-static inline void vmi_activate(void) {}
-static inline void vmi_bringup(void) {}
-#endif
-
-/* State needed to start an application processor in an SMP system. */
-struct vmi_ap_state {
- u32 cr0;
- u32 cr2;
- u32 cr3;
- u32 cr4;
-
- u64 efer;
-
- u32 eip;
- u32 eflags;
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esp;
- u32 ebp;
- u32 esi;
- u32 edi;
- u16 cs;
- u16 ss;
- u16 ds;
- u16 es;
- u16 fs;
- u16 gs;
- u16 ldtr;
-
- u16 gdtr_limit;
- u32 gdtr_base;
- u32 idtr_base;
- u16 idtr_limit;
-};
-
-#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
deleted file mode 100644
index c6e0bee93e3c..000000000000
--- a/arch/x86/include/asm/vmi_time.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * VMI Time wrappers
- *
- * Copyright (C) 2006, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to dhecht@vmware.com
- *
- */
-
-#ifndef _ASM_X86_VMI_TIME_H
-#define _ASM_X86_VMI_TIME_H
-
-/*
- * Raw VMI call indices for timer functions
- */
-#define VMI_CALL_GetCycleFrequency 66
-#define VMI_CALL_GetCycleCounter 67
-#define VMI_CALL_SetAlarm 68
-#define VMI_CALL_CancelAlarm 69
-#define VMI_CALL_GetWallclockTime 70
-#define VMI_CALL_WallclockUpdated 71
-
-/* Cached VMI timer operations */
-extern struct vmi_timer_ops {
- u64 (*get_cycle_frequency)(void);
- u64 (*get_cycle_counter)(int);
- u64 (*get_wallclock)(void);
- int (*wallclock_updated)(void);
- void (*set_alarm)(u32 flags, u64 expiry, u64 period);
- void (*cancel_alarm)(u32 flags);
-} vmi_timer_ops;
-
-/* Prototypes */
-extern void __init vmi_time_init(void);
-extern unsigned long vmi_get_wallclock(void);
-extern int vmi_set_wallclock(unsigned long now);
-extern unsigned long long vmi_sched_clock(void);
-extern unsigned long vmi_tsc_khz(void);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-extern void __devinit vmi_time_bsp_init(void);
-extern void __devinit vmi_time_ap_init(void);
-#endif
-
-/*
- * When run under a hypervisor, a vcpu is always in one of three states:
- * running, halted, or ready. The vcpu is in the 'running' state if it
- * is executing. When the vcpu executes the halt interface, the vcpu
- * enters the 'halted' state and remains halted until there is some work
- * pending for the vcpu (e.g. an alarm expires, host I/O completes on
- * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
- * state (waiting for the hypervisor to reschedule it). Finally, at any
- * time when the vcpu is not in the 'running' state nor the 'halted'
- * state, it is in the 'ready' state.
- *
- * Real time is advances while the vcpu is 'running', 'ready', or
- * 'halted'. Stolen time is the time in which the vcpu is in the
- * 'ready' state. Available time is the remaining time -- the vcpu is
- * either 'running' or 'halted'.
- *
- * All three views of time are accessible through the VMI cycle
- * counters.
- */
-
-/* The cycle counters. */
-#define VMI_CYCLES_REAL 0
-#define VMI_CYCLES_AVAILABLE 1
-#define VMI_CYCLES_STOLEN 2
-
-/* The alarm interface 'flags' bits */
-#define VMI_ALARM_COUNTERS 2
-
-#define VMI_ALARM_COUNTER_MASK 0x000000ff
-
-#define VMI_ALARM_WIRED_IRQ0 0x00000000
-#define VMI_ALARM_WIRED_LVTT 0x00010000
-
-#define VMI_ALARM_IS_ONESHOT 0x00000000
-#define VMI_ALARM_IS_PERIODIC 0x00000100
-
-#define CONFIG_VMI_ALARM_HZ 100
-
-#endif /* _ASM_X86_VMI_TIME_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7490bf8d1459..80a93dc99076 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,15 +86,15 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_APB_TIMER) += apb_timer.o
-obj-$(CONFIG_K8_NB) += k8.o
+obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
-obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
@@ -107,6 +107,7 @@ obj-$(CONFIG_SCx200) += scx200.o
scx200-y += scx200_32.o
obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
obj-$(CONFIG_X86_MRST) += mrst.o
@@ -123,7 +124,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
- obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb16f17e59be..5812404a0d4c 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
#include
#include
+#include
/*
* Initialize bm_flags based on the CPU cache properties
@@ -65,16 +66,6 @@ static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
-#define MWAIT_SUBSTATE_MASK (0xf)
-#define MWAIT_CSTATE_MASK (0xf)
-#define MWAIT_SUBSTATE_SIZE (4)
-
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
-
-#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
-
#define NATIVE_CSTATE_BEYOND_HALT (2)
static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 679b6450382b..d2fdb0826df2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel
* Leo Duran
*
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5a170cbbbed8..3cb482e123de 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel
* Leo Duran
*
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size)
return 1UL << shift;
}
+/* Access to l1 and l2 indexed register spaces */
+
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+ u32 val;
+
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+ pci_read_config_dword(iommu->dev, 0xfc, &val);
+ return val;
+}
+
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
+ pci_write_config_dword(iommu->dev, 0xfc, val);
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+}
+
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+ u32 val;
+
+ pci_write_config_dword(iommu->dev, 0xf0, address);
+ pci_read_config_dword(iommu->dev, 0xf4, &val);
+ return val;
+}
+
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+ pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
+ pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
/****************************************************************************
*
* AMD IOMMU MMIO register space handling functions
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
{
int cap_ptr = iommu->cap_ptr;
u32 range, misc;
+ int i, j;
pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
&iommu->cap);
@@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
MMIO_GET_LD(range));
iommu->evt_msi_num = MMIO_MSI_NUM(misc);
- if (is_rd890_iommu(iommu->dev)) {
- pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
- pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
- pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
- pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
- }
+ if (!is_rd890_iommu(iommu->dev))
+ return;
+
+ /*
+ * Some rd890 systems may not be fully reconfigured by the BIOS, so
+ * it's necessary for us to store this information so it can be
+ * reprogrammed on resume
+ */
+
+ pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ &iommu->stored_addr_lo);
+ pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+ &iommu->stored_addr_hi);
+
+ /* Low bit locks writes to configuration space */
+ iommu->stored_addr_lo &= ~1;
+
+ for (i = 0; i < 6; i++)
+ for (j = 0; j < 0x12; j++)
+ iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+ for (i = 0; i < 0x83; i++)
+ iommu->stored_l2[i] = iommu_read_l2(iommu, i);
}
/*
@@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
}
-static void iommu_apply_quirks(struct amd_iommu *iommu)
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
{
- if (is_rd890_iommu(iommu->dev)) {
- pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
- pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
- pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
- pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
- }
+ int i, j;
+ u32 ioc_feature_control;
+ struct pci_dev *pdev = NULL;
+
+ /* RD890 BIOSes may not have completely reconfigured the iommu */
+ if (!is_rd890_iommu(iommu->dev))
+ return;
+
+ /*
+ * First, we need to ensure that the iommu is enabled. This is
+ * controlled by a register in the northbridge
+ */
+ pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
+
+ if (!pdev)
+ return;
+
+ /* Select Northbridge indirect register 0x75 and enable writing */
+ pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+ pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+ /* Enable the iommu */
+ if (!(ioc_feature_control & 0x1))
+ pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+ pci_dev_put(pdev);
+
+ /* Restore the iommu BAR */
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ iommu->stored_addr_lo);
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+ iommu->stored_addr_hi);
+
+ /* Restore the l1 indirect regs for each of the 6 l1s */
+ for (i = 0; i < 6; i++)
+ for (j = 0; j < 0x12; j++)
+ iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+ /* Restore the l2 indirect regs */
+ for (i = 0; i < 0x83; i++)
+ iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+ /* Lock PCI setup registers */
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ iommu->stored_addr_lo | 1);
}
/*
@@ -1147,7 +1237,6 @@ static void enable_iommus(void)
for_each_iommu(iommu) {
iommu_disable(iommu);
- iommu_apply_quirks(iommu);
iommu_init_flags(iommu);
iommu_set_device_table(iommu);
iommu_enable_command_buffer(iommu);
@@ -1173,6 +1262,11 @@ static void disable_iommus(void)
static int amd_iommu_resume(struct sys_device *dev)
{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ iommu_apply_resume_quirks(iommu);
+
/* re-load the hardware */
enable_iommus();
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/amd_nb.c
similarity index 66%
rename from arch/x86/kernel/k8.c
rename to arch/x86/kernel/amd_nb.c
index 0f7bc20cfcde..8f6463d8ed0d 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -8,21 +8,19 @@
#include
#include
#include
-#include
-
-int num_k8_northbridges;
-EXPORT_SYMBOL(num_k8_northbridges);
+#include
static u32 *flush_words;
struct pci_device_id k8_nb_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
{}
};
EXPORT_SYMBOL(k8_nb_ids);
-struct pci_dev **k8_northbridges;
+struct k8_northbridge_info k8_northbridges;
EXPORT_SYMBOL(k8_northbridges);
static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
@@ -40,36 +38,45 @@ int cache_k8_northbridges(void)
int i;
struct pci_dev *dev;
- if (num_k8_northbridges)
+ if (k8_northbridges.num)
return 0;
dev = NULL;
while ((dev = next_k8_northbridge(dev)) != NULL)
- num_k8_northbridges++;
+ k8_northbridges.num++;
- k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
- GFP_KERNEL);
- if (!k8_northbridges)
+ /* some CPU families (e.g. family 0x11) do not support GART */
+ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+ boot_cpu_data.x86 == 0x15)
+ k8_northbridges.gart_supported = 1;
+
+ k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
+ sizeof(void *), GFP_KERNEL);
+ if (!k8_northbridges.nb_misc)
return -ENOMEM;
- if (!num_k8_northbridges) {
- k8_northbridges[0] = NULL;
+ if (!k8_northbridges.num) {
+ k8_northbridges.nb_misc[0] = NULL;
return 0;
}
- flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
- if (!flush_words) {
- kfree(k8_northbridges);
- return -ENOMEM;
+ if (k8_northbridges.gart_supported) {
+ flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
+ GFP_KERNEL);
+ if (!flush_words) {
+ kfree(k8_northbridges.nb_misc);
+ return -ENOMEM;
+ }
}
dev = NULL;
i = 0;
while ((dev = next_k8_northbridge(dev)) != NULL) {
- k8_northbridges[i] = dev;
- pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
+ k8_northbridges.nb_misc[i] = dev;
+ if (k8_northbridges.gart_supported)
+ pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
}
- k8_northbridges[i] = NULL;
+ k8_northbridges.nb_misc[i] = NULL;
return 0;
}
EXPORT_SYMBOL_GPL(cache_k8_northbridges);
@@ -93,22 +100,25 @@ void k8_flush_garts(void)
unsigned long flags;
static DEFINE_SPINLOCK(gart_lock);
+ if (!k8_northbridges.gart_supported)
+ return;
+
/* Avoid races between AGP and IOMMU. In theory it's not needed
but I'm not sure if the hardware won't lose flush requests
when another is pending. This whole thing is so expensive anyways
that it doesn't matter to serialize more. -AK */
spin_lock_irqsave(&gart_lock, flags);
flushed = 0;
- for (i = 0; i < num_k8_northbridges; i++) {
- pci_write_config_dword(k8_northbridges[i], 0x9c,
+ for (i = 0; i < k8_northbridges.num; i++) {
+ pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
flush_words[i]|1);
flushed++;
}
- for (i = 0; i < num_k8_northbridges; i++) {
+ for (i = 0; i < k8_northbridges.num; i++) {
u32 w;
/* Make sure the hardware actually executed the flush*/
for (;;) {
- pci_read_config_dword(k8_northbridges[i],
+ pci_read_config_dword(k8_northbridges.nb_misc[i],
0x9c, &w);
if (!(w & 1))
break;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 8dd77800ff5d..92543c73cf8e 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -231,34 +231,6 @@ static void apbt_restart_clocksource(struct clocksource *cs)
apbt_start_counter(phy_cs_timer_id);
}
-/* Setup IRQ routing via IOAPIC */
-#ifdef CONFIG_SMP
-static void apbt_setup_irq(struct apbt_dev *adev)
-{
- struct irq_chip *chip;
- struct irq_desc *desc;
-
- /* timer0 irq has been setup early */
- if (adev->irq == 0)
- return;
- desc = irq_to_desc(adev->irq);
- chip = get_irq_chip(adev->irq);
- disable_irq(adev->irq);
- desc->status |= IRQ_MOVE_PCNTXT;
- irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
- /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
- set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
- enable_irq(adev->irq);
- if (system_state == SYSTEM_BOOTING)
- if (request_irq(adev->irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- adev->name, adev)) {
- printk(KERN_ERR "Failed request IRQ for APBT%d\n",
- adev->num);
- }
-}
-#endif
-
static void apbt_enable_int(int n)
{
unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
@@ -334,6 +306,27 @@ static int __init apbt_clockevent_register(void)
}
#ifdef CONFIG_SMP
+
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+ /* timer0 irq has been setup early */
+ if (adev->irq == 0)
+ return;
+
+ if (system_state == SYSTEM_BOOTING) {
+ irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+ /* APB timer irqs are set up as mp_irqs, timer is edge type */
+ __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
+ if (request_irq(adev->irq, apbt_interrupt_handler,
+ IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+ adev->name, adev)) {
+ printk(KERN_ERR "Failed request IRQ for APBT%d\n",
+ adev->num);
+ }
+ } else
+ enable_irq(adev->irq);
+}
+
/* Should be called with per cpu */
void apbt_setup_secondary_clock(void)
{
@@ -343,7 +336,7 @@ void apbt_setup_secondary_clock(void)
/* Don't register boot CPU clockevent */
cpu = smp_processor_id();
- if (cpu == boot_cpu_id)
+ if (!cpu)
return;
/*
* We need to calculate the scaled math multiplication factor for
@@ -389,16 +382,17 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
switch (action & 0xf) {
case CPU_DEAD:
+ disable_irq(adev->irq);
apbt_disable_int(cpu);
- if (system_state == SYSTEM_RUNNING)
+ if (system_state == SYSTEM_RUNNING) {
pr_debug("skipping APBT CPU %lu offline\n", cpu);
- else if (adev) {
+ } else if (adev) {
pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
free_irq(adev->irq, adev);
}
break;
default:
- pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
+ pr_debug("APBT notified %lu, no action\n", action);
}
return NOTIFY_OK;
}
@@ -552,7 +546,7 @@ bad_count:
pr_debug("APB CS going back %lx:%lx:%lx ",
t2, last_read, t2 - last_read);
bad_count_x3:
- pr_debug(KERN_INFO "tripple check enforced\n");
+ pr_debug("triple check enforced\n");
t0 = apbt_readl(phy_cs_timer_id,
APBTMR_N_CURRENT_VALUE);
udelay(1);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index a2e0caf26e17..377f5db3b8b4 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -27,7 +27,7 @@
#include
#include
#include
-#include
+#include
#include
int gart_iommu_aperture;
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void)
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- aper_enabled = ctl & AMD64_GARTEN;
+ aper_enabled = ctl & GARTEN;
aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void)
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- ctl &= ~AMD64_GARTEN;
+ ctl &= ~GARTEN;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
}
}
@@ -505,8 +505,13 @@ out:
/* Fix up the north bridges */
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
- int bus;
- int dev_base, dev_limit;
+ int bus, dev_base, dev_limit;
+
+ /*
+ * Don't enable translation yet but enable GART IO and CPU
+ * accesses and set DISTLBWALKPRB since GART table memory is UC.
+ */
+ u32 ctl = DISTLBWALKPRB | aper_order << 1;
bus = bus_dev_ranges[i].bus;
dev_base = bus_dev_ranges[i].dev_base;
@@ -515,10 +520,7 @@ out:
if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
- /* Don't enable translation yet. That is done later.
- Assume this BIOS didn't initialise the GART so
- just overwrite all previous bits */
- write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
}
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e3b534cda49a..850657d1b0ed 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -52,6 +52,7 @@
#include
#include
#include
+#include
unsigned int num_processors;
@@ -370,38 +371,87 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
}
/*
- * Setup extended LVT, AMD specific (K8, family 10h)
+ * Setup extended LVT, AMD specific
*
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ * Software should use the LVT offsets the BIOS provides. The offsets
+ * are determined by the subsystems using it like those for MCE
+ * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
+ * are supported. Beginning with family 10h at least 4 offsets are
+ * available.
*
+ * Since the offsets must be consistent for all cores, we keep track
+ * of the LVT offsets in software and reserve the offset for the same
+ * vector also to be used on other cores. An offset is freed by
+ * setting the entry to APIC_EILVT_MASKED.
+ *
+ * If the BIOS is right, there should be no conflicts. Otherwise a
+ * "[Firmware Bug]: ..." error message is generated. However, if
+ * software does not properly determines the offsets, it is not
+ * necessarily a BIOS bug.
+ */
+
+static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
+
+static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
+{
+ return (old & APIC_EILVT_MASKED)
+ || (new == APIC_EILVT_MASKED)
+ || ((new & ~APIC_EILVT_MASKED) == old);
+}
+
+static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
+{
+ unsigned int rsvd; /* 0: uninitialized */
+
+ if (offset >= APIC_EILVT_NR_MAX)
+ return ~0;
+
+ rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
+ do {
+ if (rsvd &&
+ !eilvt_entry_is_changeable(rsvd, new))
+ /* may not change if vectors are different */
+ return rsvd;
+ rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
+ } while (rsvd != new);
+
+ return new;
+}
+
+/*
* If mask=1, the LVT entry does not generate interrupts while mask=0
* enables the vector. See also the BKDGs.
*/
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
-
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
{
- unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
- unsigned int v = (mask << 16) | (msg_type << 8) | vector;
+ unsigned long reg = APIC_EILVTn(offset);
+ unsigned int new, old, reserved;
- apic_write(reg, v);
-}
+ new = (mask << 16) | (msg_type << 8) | vector;
+ old = apic_read(reg);
+ reserved = reserve_eilvt_offset(offset, new);
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_MCE;
-}
+ if (reserved != new) {
+ pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
+ "vector 0x%x was already reserved by another core, "
+ "APIC%lX=0x%x\n",
+ smp_processor_id(), new, reserved, reg, old);
+ return -EINVAL;
+ }
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_IBS;
+ if (!eilvt_entry_is_changeable(old, new)) {
+ pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
+ "register already in use, APIC%lX=0x%x\n",
+ smp_processor_id(), new, reg, old);
+ return -EBUSY;
+ }
+
+ apic_write(reg, new);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
/*
* Program the next event, relative to now
@@ -1665,10 +1715,7 @@ int __init APIC_init_uniprocessor(void)
}
#endif
-#ifndef CONFIG_SMP
- enable_IR_x2apic();
default_setup_apic_routing();
-#endif
verify_local_APIC();
connect_bsp_APIC();
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5c5b8f3dddb5..8ae808d110f4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -131,13 +131,9 @@ struct irq_pin_list {
struct irq_pin_list *next;
};
-static struct irq_pin_list *get_one_free_irq_2_pin(int node)
+static struct irq_pin_list *alloc_irq_pin_list(int node)
{
- struct irq_pin_list *pin;
-
- pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
-
- return pin;
+ return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
}
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -150,10 +146,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS];
int __init arch_early_irq_init(void)
{
struct irq_cfg *cfg;
- struct irq_desc *desc;
- int count;
- int node;
- int i;
+ int count, node, i;
if (!legacy_pic->nr_legacy_irqs) {
nr_irqs_gsi = 0;
@@ -162,13 +155,15 @@ int __init arch_early_irq_init(void)
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
- node= cpu_to_node(boot_cpu_id);
+ node = cpu_to_node(0);
+
+ /* Make sure the legacy interrupts are marked in the bitmap */
+ irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
for (i = 0; i < count; i++) {
- desc = irq_to_desc(i);
- desc->chip_data = &cfg[i];
- zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
- zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
+ set_irq_chip_data(i, &cfg[i]);
+ zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
/*
* For legacy IRQ's, start with assigning irq0 to irq15 to
* IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
@@ -183,170 +178,88 @@ int __init arch_early_irq_init(void)
}
#ifdef CONFIG_SPARSE_IRQ
-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct irq_cfg *irq_cfg(unsigned int irq)
{
- struct irq_cfg *cfg = NULL;
- struct irq_desc *desc;
+ return get_irq_chip_data(irq);
+}
- desc = irq_to_desc(irq);
- if (desc)
- cfg = desc->chip_data;
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+{
+ struct irq_cfg *cfg;
+ cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+ if (!cfg)
+ return NULL;
+ if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
+ goto out_cfg;
+ if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+ goto out_domain;
return cfg;
+out_domain:
+ free_cpumask_var(cfg->domain);
+out_cfg:
+ kfree(cfg);
+ return NULL;
}
-static struct irq_cfg *get_one_free_irq_cfg(int node)
+static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
{
- struct irq_cfg *cfg;
-
- cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
- if (cfg) {
- if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
- kfree(cfg);
- cfg = NULL;
- } else if (!zalloc_cpumask_var_node(&cfg->old_domain,
- GFP_ATOMIC, node)) {
- free_cpumask_var(cfg->domain);
- kfree(cfg);
- cfg = NULL;
- }
- }
-
- return cfg;
-}
-
-int arch_init_chip_data(struct irq_desc *desc, int node)
-{
- struct irq_cfg *cfg;
-
- cfg = desc->chip_data;
- if (!cfg) {
- desc->chip_data = get_one_free_irq_cfg(node);
- if (!desc->chip_data) {
- printk(KERN_ERR "can not alloc irq_cfg\n");
- BUG_ON(1);
- }
- }
-
- return 0;
-}
-
-/* for move_irq_desc */
-static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
-{
- struct irq_pin_list *old_entry, *head, *tail, *entry;
-
- cfg->irq_2_pin = NULL;
- old_entry = old_cfg->irq_2_pin;
- if (!old_entry)
- return;
-
- entry = get_one_free_irq_2_pin(node);
- if (!entry)
- return;
-
- entry->apic = old_entry->apic;
- entry->pin = old_entry->pin;
- head = entry;
- tail = entry;
- old_entry = old_entry->next;
- while (old_entry) {
- entry = get_one_free_irq_2_pin(node);
- if (!entry) {
- entry = head;
- while (entry) {
- head = entry->next;
- kfree(entry);
- entry = head;
- }
- /* still use the old one */
- return;
- }
- entry->apic = old_entry->apic;
- entry->pin = old_entry->pin;
- tail->next = entry;
- tail = entry;
- old_entry = old_entry->next;
- }
-
- tail->next = NULL;
- cfg->irq_2_pin = head;
-}
-
-static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
-{
- struct irq_pin_list *entry, *next;
-
- if (old_cfg->irq_2_pin == cfg->irq_2_pin)
- return;
-
- entry = old_cfg->irq_2_pin;
-
- while (entry) {
- next = entry->next;
- kfree(entry);
- entry = next;
- }
- old_cfg->irq_2_pin = NULL;
-}
-
-void arch_init_copy_chip_data(struct irq_desc *old_desc,
- struct irq_desc *desc, int node)
-{
- struct irq_cfg *cfg;
- struct irq_cfg *old_cfg;
-
- cfg = get_one_free_irq_cfg(node);
-
if (!cfg)
return;
-
- desc->chip_data = cfg;
-
- old_cfg = old_desc->chip_data;
-
- cfg->vector = old_cfg->vector;
- cfg->move_in_progress = old_cfg->move_in_progress;
- cpumask_copy(cfg->domain, old_cfg->domain);
- cpumask_copy(cfg->old_domain, old_cfg->old_domain);
-
- init_copy_irq_2_pin(old_cfg, cfg, node);
-}
-
-static void free_irq_cfg(struct irq_cfg *cfg)
-{
+ set_irq_chip_data(at, NULL);
free_cpumask_var(cfg->domain);
free_cpumask_var(cfg->old_domain);
kfree(cfg);
}
-void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
-{
- struct irq_cfg *old_cfg, *cfg;
-
- old_cfg = old_desc->chip_data;
- cfg = desc->chip_data;
-
- if (old_cfg == cfg)
- return;
-
- if (old_cfg) {
- free_irq_2_pin(old_cfg, cfg);
- free_irq_cfg(old_cfg);
- old_desc->chip_data = NULL;
- }
-}
-/* end for move_irq_desc */
-
#else
+
struct irq_cfg *irq_cfg(unsigned int irq)
{
return irq < nr_irqs ? irq_cfgx + irq : NULL;
}
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+{
+ return irq_cfgx + irq;
+}
+
+static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
+
#endif
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+{
+ int res = irq_alloc_desc_at(at, node);
+ struct irq_cfg *cfg;
+
+ if (res < 0) {
+ if (res != -EEXIST)
+ return NULL;
+ cfg = get_irq_chip_data(at);
+ if (cfg)
+ return cfg;
+ }
+
+ cfg = alloc_irq_cfg(at, node);
+ if (cfg)
+ set_irq_chip_data(at, cfg);
+ else
+ irq_free_desc(at);
+ return cfg;
+}
+
+static int alloc_irq_from(unsigned int from, int node)
+{
+ return irq_alloc_desc_from(from, node);
+}
+
+static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
+{
+ free_irq_cfg(at, cfg);
+ irq_free_desc(at);
+}
+
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -451,7 +364,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
}
-void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
@@ -481,7 +394,7 @@ static void ioapic_mask_entry(int apic, int pin)
* fast in the common case, and fast for shared ISA-space IRQs.
*/
static int
-add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
+__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
struct irq_pin_list **last, *entry;
@@ -493,7 +406,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
last = &entry->next;
}
- entry = get_one_free_irq_2_pin(node);
+ entry = alloc_irq_pin_list(node);
if (!entry) {
printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
node, apic, pin);
@@ -508,7 +421,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
- if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin))
+ if (__add_pin_to_irq_node(cfg, node, apic, pin))
panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
}
@@ -571,11 +484,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
}
-static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
static void io_apic_sync(struct irq_pin_list *entry)
{
/*
@@ -587,44 +495,37 @@ static void io_apic_sync(struct irq_pin_list *entry)
readl(&io_apic->data);
}
-static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
+static void mask_ioapic(struct irq_cfg *cfg)
{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
-}
-
-static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
-{
- struct irq_cfg *cfg = desc->chip_data;
- unsigned long flags;
-
- BUG_ON(!cfg);
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(cfg);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
+static void mask_ioapic_irq(struct irq_data *data)
+{
+ mask_ioapic(data->chip_data);
+}
+
+static void __unmask_ioapic(struct irq_cfg *cfg)
+{
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
+}
+
+static void unmask_ioapic(struct irq_cfg *cfg)
{
- struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(cfg);
+ __unmask_ioapic(cfg);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void mask_IO_APIC_irq(unsigned int irq)
+static void unmask_ioapic_irq(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- mask_IO_APIC_irq_desc(desc);
-}
-static void unmask_IO_APIC_irq(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- unmask_IO_APIC_irq_desc(desc);
+ unmask_ioapic(data->chip_data);
}
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -694,14 +595,14 @@ struct IO_APIC_route_entry **alloc_ioapic_entries(void)
struct IO_APIC_route_entry **ioapic_entries;
ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
- GFP_ATOMIC);
+ GFP_KERNEL);
if (!ioapic_entries)
return 0;
for (apic = 0; apic < nr_ioapics; apic++) {
ioapic_entries[apic] =
kzalloc(sizeof(struct IO_APIC_route_entry) *
- nr_ioapic_registers[apic], GFP_ATOMIC);
+ nr_ioapic_registers[apic], GFP_KERNEL);
if (!ioapic_entries[apic])
goto nomem;
}
@@ -1259,7 +1160,6 @@ void __setup_vector_irq(int cpu)
/* Initialize vector_irq on a new cpu */
int irq, vector;
struct irq_cfg *cfg;
- struct irq_desc *desc;
/*
* vector_lock will make sure that we don't run into irq vector
@@ -1268,9 +1168,10 @@ void __setup_vector_irq(int cpu)
*/
raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
- for_each_irq_desc(irq, desc) {
- cfg = desc->chip_data;
-
+ for_each_active_irq(irq) {
+ cfg = get_irq_chip_data(irq);
+ if (!cfg)
+ continue;
/*
* If it is a legacy IRQ handled by the legacy PIC, this cpu
* will be part of the irq_cfg's domain.
@@ -1327,17 +1228,17 @@ static inline int IO_APIC_irq_trigger(int irq)
}
#endif
-static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
+static void ioapic_register_intr(unsigned int irq, unsigned long trigger)
{
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
- desc->status |= IRQ_LEVEL;
+ irq_set_status_flags(irq, IRQ_LEVEL);
else
- desc->status &= ~IRQ_LEVEL;
+ irq_clear_status_flags(irq, IRQ_LEVEL);
- if (irq_remapped(irq)) {
- desc->status |= IRQ_MOVE_PCNTXT;
+ if (irq_remapped(get_irq_chip_data(irq))) {
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
if (trigger)
set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
handle_fasteoi_irq,
@@ -1358,10 +1259,10 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
handle_edge_irq, "edge");
}
-int setup_ioapic_entry(int apic_id, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector, int pin)
+static int setup_ioapic_entry(int apic_id, int irq,
+ struct IO_APIC_route_entry *entry,
+ unsigned int destination, int trigger,
+ int polarity, int vector, int pin)
{
/*
* add it to the IO-APIC irq-routing table:
@@ -1382,21 +1283,7 @@ int setup_ioapic_entry(int apic_id, int irq,
if (index < 0)
panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
- memset(&irte, 0, sizeof(irte));
-
- irte.present = 1;
- irte.dst_mode = apic->irq_dest_mode;
- /*
- * Trigger mode in the IRTE will always be edge, and the
- * actual level or edge trigger will be setup in the IO-APIC
- * RTE. This will help simplify level triggered irq migration.
- * For more details, see the comments above explainig IO-APIC
- * irq migration in the presence of interrupt-remapping.
- */
- irte.trigger_mode = 0;
- irte.dlvry_mode = apic->irq_delivery_mode;
- irte.vector = vector;
- irte.dest_id = IRTE_DEST(destination);
+ prepare_irte(&irte, vector, destination);
/* Set source-id of interrupt request */
set_ioapic_sid(&irte, apic_id);
@@ -1431,18 +1318,14 @@ int setup_ioapic_entry(int apic_id, int irq,
return 0;
}
-static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
- int trigger, int polarity)
+static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
+ struct irq_cfg *cfg, int trigger, int polarity)
{
- struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
unsigned int dest;
if (!IO_APIC_IRQ(irq))
return;
-
- cfg = desc->chip_data;
-
/*
* For legacy irqs, cfg->domain starts with cpu 0 for legacy
* controllers like 8259. Now that IO-APIC can handle this irq, update
@@ -1471,9 +1354,9 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
return;
}
- ioapic_register_intr(irq, desc, trigger);
+ ioapic_register_intr(irq, trigger);
if (irq < legacy_pic->nr_legacy_irqs)
- legacy_pic->chip->mask(irq);
+ legacy_pic->mask(irq);
ioapic_write_entry(apic_id, pin, entry);
}
@@ -1484,11 +1367,9 @@ static struct {
static void __init setup_IO_APIC_irqs(void)
{
- int apic_id, pin, idx, irq;
- int notcon = 0;
- struct irq_desc *desc;
+ int apic_id, pin, idx, irq, notcon = 0;
+ int node = cpu_to_node(0);
struct irq_cfg *cfg;
- int node = cpu_to_node(boot_cpu_id);
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
@@ -1525,19 +1406,17 @@ static void __init setup_IO_APIC_irqs(void)
apic->multi_timer_check(apic_id, irq))
continue;
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ cfg = alloc_irq_and_cfg_at(irq, node);
+ if (!cfg)
continue;
- }
- cfg = desc->chip_data;
+
add_pin_to_irq_node(cfg, node, apic_id, pin);
/*
* don't mark it in pin_programmed, so later acpi could
* set it correctly when irq < 16
*/
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
- irq_trigger(idx), irq_polarity(idx));
+ setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
+ irq_polarity(idx));
}
if (notcon)
@@ -1552,9 +1431,7 @@ static void __init setup_IO_APIC_irqs(void)
*/
void setup_IO_APIC_irq_extra(u32 gsi)
{
- int apic_id = 0, pin, idx, irq;
- int node = cpu_to_node(boot_cpu_id);
- struct irq_desc *desc;
+ int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
struct irq_cfg *cfg;
/*
@@ -1570,18 +1447,15 @@ void setup_IO_APIC_irq_extra(u32 gsi)
return;
irq = pin_2_irq(idx, apic_id, pin);
-#ifdef CONFIG_SPARSE_IRQ
- desc = irq_to_desc(irq);
- if (desc)
- return;
-#endif
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
- return;
- }
- cfg = desc->chip_data;
+ /* Only handle the non legacy irqs on secondary ioapics */
+ if (apic_id == 0 || irq < NR_IRQS_LEGACY)
+ return;
+
+ cfg = alloc_irq_and_cfg_at(irq, node);
+ if (!cfg)
+ return;
+
add_pin_to_irq_node(cfg, node, apic_id, pin);
if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
@@ -1591,7 +1465,7 @@ void setup_IO_APIC_irq_extra(u32 gsi)
}
set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
+ setup_ioapic_irq(apic_id, pin, irq, cfg,
irq_trigger(idx), irq_polarity(idx));
}
@@ -1642,7 +1516,6 @@ __apicdebuginit(void) print_IO_APIC(void)
union IO_APIC_reg_03 reg_03;
unsigned long flags;
struct irq_cfg *cfg;
- struct irq_desc *desc;
unsigned int irq;
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
@@ -1729,10 +1602,10 @@ __apicdebuginit(void) print_IO_APIC(void)
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for_each_irq_desc(irq, desc) {
+ for_each_active_irq(irq) {
struct irq_pin_list *entry;
- cfg = desc->chip_data;
+ cfg = get_irq_chip_data(irq);
if (!cfg)
continue;
entry = cfg->irq_2_pin;
@@ -2239,29 +2112,26 @@ static int __init timer_irq_works(void)
* an edge even if it isn't on the 8259A...
*/
-static unsigned int startup_ioapic_irq(unsigned int irq)
+static unsigned int startup_ioapic_irq(struct irq_data *data)
{
- int was_pending = 0;
+ int was_pending = 0, irq = data->irq;
unsigned long flags;
- struct irq_cfg *cfg;
raw_spin_lock_irqsave(&ioapic_lock, flags);
if (irq < legacy_pic->nr_legacy_irqs) {
- legacy_pic->chip->mask(irq);
+ legacy_pic->mask(irq);
if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
- cfg = irq_cfg(irq);
- __unmask_IO_APIC_irq(cfg);
+ __unmask_ioapic(data->chip_data);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
-static int ioapic_retrigger_irq(unsigned int irq)
+static int ioapic_retrigger_irq(struct irq_data *data)
{
-
- struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_cfg *cfg = data->chip_data;
unsigned long flags;
raw_spin_lock_irqsave(&vector_lock, flags);
@@ -2312,7 +2182,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
* With interrupt-remapping, destination information comes
* from interrupt-remapping table entry.
*/
- if (!irq_remapped(irq))
+ if (!irq_remapped(cfg))
io_apic_write(apic, 0x11 + pin*2, dest);
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
@@ -2322,65 +2192,46 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
}
/*
- * Either sets desc->affinity to a valid value, and returns
+ * Either sets data->affinity to a valid value, and returns
* ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves desc->affinity untouched.
+ * leaves data->affinity untouched.
*/
-unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
- unsigned int *dest_id)
+int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ unsigned int *dest_id)
{
- struct irq_cfg *cfg;
- unsigned int irq;
+ struct irq_cfg *cfg = data->chip_data;
if (!cpumask_intersects(mask, cpu_online_mask))
return -1;
- irq = desc->irq;
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
+ if (assign_irq_vector(data->irq, data->chip_data, mask))
return -1;
- cpumask_copy(desc->affinity, mask);
+ cpumask_copy(data->affinity, mask);
- *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+ *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
return 0;
}
static int
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_cfg *cfg;
+ unsigned int dest, irq = data->irq;
unsigned long flags;
- unsigned int dest;
- unsigned int irq;
- int ret = -1;
-
- irq = desc->irq;
- cfg = desc->chip_data;
+ int ret;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- ret = set_desc_affinity(desc, mask, &dest);
+ ret = __ioapic_set_affinity(data, mask, &dest);
if (!ret) {
/* Only the high 8 bits are valid. */
dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, cfg);
+ __target_IO_APIC_irq(irq, dest, data->chip_data);
}
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
return ret;
}
-static int
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
-
- return set_ioapic_affinity_irq_desc(desc, mask);
-}
-
#ifdef CONFIG_INTR_REMAP
/*
@@ -2395,24 +2246,21 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
* the interrupt-remapping table entry.
*/
static int
-migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
struct irte irte;
- unsigned int dest;
- unsigned int irq;
- int ret = -1;
if (!cpumask_intersects(mask, cpu_online_mask))
- return ret;
+ return -EINVAL;
- irq = desc->irq;
if (get_irte(irq, &irte))
- return ret;
+ return -EBUSY;
- cfg = desc->chip_data;
if (assign_irq_vector(irq, cfg, mask))
- return ret;
+ return -EBUSY;
dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
@@ -2427,29 +2275,14 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
if (cfg->move_in_progress)
send_cleanup_vector(cfg);
- cpumask_copy(desc->affinity, mask);
-
+ cpumask_copy(data->affinity, mask);
return 0;
}
-/*
- * Migrates the IRQ destination in the process context.
- */
-static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
- const struct cpumask *mask)
-{
- return migrate_ioapic_irq_desc(desc, mask);
-}
-static int set_ir_ioapic_affinity_irq(unsigned int irq,
- const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- return set_ir_ioapic_affinity_irq_desc(desc, mask);
-}
#else
-static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
- const struct cpumask *mask)
+static inline int
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
return 0;
}
@@ -2511,10 +2344,8 @@ unlock:
irq_exit();
}
-static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
+static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
{
- struct irq_desc *desc = *descp;
- struct irq_cfg *cfg = desc->chip_data;
unsigned me;
if (likely(!cfg->move_in_progress))
@@ -2526,31 +2357,28 @@ static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
send_cleanup_vector(cfg);
}
-static void irq_complete_move(struct irq_desc **descp)
+static void irq_complete_move(struct irq_cfg *cfg)
{
- __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
+ __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
}
void irq_force_complete_move(int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
+ struct irq_cfg *cfg = get_irq_chip_data(irq);
if (!cfg)
return;
- __irq_complete_move(&desc, cfg->vector);
+ __irq_complete_move(cfg, cfg->vector);
}
#else
-static inline void irq_complete_move(struct irq_desc **descp) {}
+static inline void irq_complete_move(struct irq_cfg *cfg) { }
#endif
-static void ack_apic_edge(unsigned int irq)
+static void ack_apic_edge(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- irq_complete_move(&desc);
- move_native_irq(irq);
+ irq_complete_move(data->chip_data);
+ move_native_irq(data->irq);
ack_APIC_irq();
}
@@ -2572,10 +2400,12 @@ atomic_t irq_mis_count;
* Otherwise, we simulate the EOI message manually by changing the trigger
* mode to edge and then back to level, with RTE being masked during this.
*/
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
{
struct irq_pin_list *entry;
+ unsigned long flags;
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, cfg->irq_2_pin) {
if (mp_ioapics[entry->apic].apicver >= 0x20) {
/*
@@ -2584,7 +2414,7 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
* intr-remapping table entry. Hence for the io-apic
* EOI we use the pin number.
*/
- if (irq_remapped(irq))
+ if (irq_remapped(cfg))
io_apic_eoi(entry->apic, entry->pin);
else
io_apic_eoi(entry->apic, cfg->vector);
@@ -2593,36 +2423,22 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
__unmask_and_level_IO_APIC_irq(entry);
}
}
-}
-
-static void eoi_ioapic_irq(struct irq_desc *desc)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_irq(irq, cfg);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void ack_apic_level(unsigned int irq)
+static void ack_apic_level(struct irq_data *data)
{
+ struct irq_cfg *cfg = data->chip_data;
+ int i, do_unmask_irq = 0, irq = data->irq;
struct irq_desc *desc = irq_to_desc(irq);
unsigned long v;
- int i;
- struct irq_cfg *cfg;
- int do_unmask_irq = 0;
- irq_complete_move(&desc);
+ irq_complete_move(cfg);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
- mask_IO_APIC_irq_desc(desc);
+ mask_ioapic(cfg);
}
#endif
@@ -2658,7 +2474,6 @@ static void ack_apic_level(unsigned int irq)
* we use the above logic (mask+edge followed by unmask+level) from
* Manfred Spraul to clear the remote IRR.
*/
- cfg = desc->chip_data;
i = cfg->vector;
v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
@@ -2678,7 +2493,7 @@ static void ack_apic_level(unsigned int irq)
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
- eoi_ioapic_irq(desc);
+ eoi_ioapic_irq(irq, cfg);
}
/* Now we can move and renable the irq */
@@ -2709,61 +2524,57 @@ static void ack_apic_level(unsigned int irq)
* accurate and is causing problems then it is a hardware bug
* and you can go talk to the chipset vendor about it.
*/
- cfg = desc->chip_data;
if (!io_apic_level_ack_pending(cfg))
move_masked_irq(irq);
- unmask_IO_APIC_irq_desc(desc);
+ unmask_ioapic(cfg);
}
}
#ifdef CONFIG_INTR_REMAP
-static void ir_ack_apic_edge(unsigned int irq)
+static void ir_ack_apic_edge(struct irq_data *data)
{
ack_APIC_irq();
}
-static void ir_ack_apic_level(unsigned int irq)
+static void ir_ack_apic_level(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
ack_APIC_irq();
- eoi_ioapic_irq(desc);
+ eoi_ioapic_irq(data->irq, data->chip_data);
}
#endif /* CONFIG_INTR_REMAP */
static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_apic_edge,
- .eoi = ack_apic_level,
+ .name = "IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = ack_apic_edge,
+ .irq_eoi = ack_apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
+ .irq_set_affinity = ioapic_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static struct irq_chip ir_ioapic_chip __read_mostly = {
- .name = "IR-IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
+ .name = "IR-IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
- .eoi = ir_ack_apic_level,
+ .irq_ack = ir_ack_apic_edge,
+ .irq_eoi = ir_ack_apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ir_ioapic_affinity_irq,
+ .irq_set_affinity = ir_ioapic_set_affinity,
#endif
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static inline void init_IO_APIC_traps(void)
{
- int irq;
- struct irq_desc *desc;
struct irq_cfg *cfg;
+ unsigned int irq;
/*
* NOTE! The local APIC isn't very good at handling
@@ -2776,8 +2587,8 @@ static inline void init_IO_APIC_traps(void)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- for_each_irq_desc(irq, desc) {
- cfg = desc->chip_data;
+ for_each_active_irq(irq) {
+ cfg = get_irq_chip_data(irq);
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
@@ -2788,7 +2599,7 @@ static inline void init_IO_APIC_traps(void)
legacy_pic->make_irq(irq);
else
/* Strange. Oh, well.. */
- desc->chip = &no_irq_chip;
+ set_irq_chip(irq, &no_irq_chip);
}
}
}
@@ -2797,7 +2608,7 @@ static inline void init_IO_APIC_traps(void)
* The local APIC irq-chip implementation:
*/
-static void mask_lapic_irq(unsigned int irq)
+static void mask_lapic_irq(struct irq_data *data)
{
unsigned long v;
@@ -2805,7 +2616,7 @@ static void mask_lapic_irq(unsigned int irq)
apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void unmask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq(struct irq_data *data)
{
unsigned long v;
@@ -2813,21 +2624,21 @@ static void unmask_lapic_irq(unsigned int irq)
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void ack_lapic_irq(unsigned int irq)
+static void ack_lapic_irq(struct irq_data *data)
{
ack_APIC_irq();
}
static struct irq_chip lapic_chip __read_mostly = {
.name = "local-APIC",
- .mask = mask_lapic_irq,
- .unmask = unmask_lapic_irq,
- .ack = ack_lapic_irq,
+ .irq_mask = mask_lapic_irq,
+ .irq_unmask = unmask_lapic_irq,
+ .irq_ack = ack_lapic_irq,
};
-static void lapic_register_intr(int irq, struct irq_desc *desc)
+static void lapic_register_intr(int irq)
{
- desc->status &= ~IRQ_LEVEL;
+ irq_clear_status_flags(irq, IRQ_LEVEL);
set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
"edge");
}
@@ -2930,9 +2741,8 @@ int timer_through_8259 __initdata;
*/
static inline void __init check_timer(void)
{
- struct irq_desc *desc = irq_to_desc(0);
- struct irq_cfg *cfg = desc->chip_data;
- int node = cpu_to_node(boot_cpu_id);
+ struct irq_cfg *cfg = get_irq_chip_data(0);
+ int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
unsigned long flags;
int no_pin1 = 0;
@@ -2942,7 +2752,7 @@ static inline void __init check_timer(void)
/*
* get/set the timer IRQ vector:
*/
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
assign_irq_vector(0, cfg, apic->target_cpus());
/*
@@ -3001,7 +2811,7 @@ static inline void __init check_timer(void)
add_pin_to_irq_node(cfg, node, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
} else {
- /* for edge trigger, setup_IO_APIC_irq already
+ /* for edge trigger, setup_ioapic_irq already
* leave it unmasked.
* so only need to unmask if it is level-trigger
* do we really have level trigger timer?
@@ -3009,12 +2819,12 @@ static inline void __init check_timer(void)
int idx;
idx = find_irq_entry(apic1, pin1, mp_INT);
if (idx != -1 && irq_trigger(idx))
- unmask_IO_APIC_irq_desc(desc);
+ unmask_ioapic(cfg);
}
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
}
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
@@ -3037,14 +2847,14 @@ static inline void __init check_timer(void)
*/
replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
setup_nmi();
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
}
goto out;
}
@@ -3052,7 +2862,7 @@ static inline void __init check_timer(void)
* Cleanup, just in case ...
*/
local_irq_disable();
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
@@ -3069,16 +2879,16 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
- lapic_register_intr(0, desc);
+ lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
local_irq_disable();
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -3244,49 +3054,42 @@ device_initcall(ioapic_init_sysfs);
/*
* Dynamic irq allocate and deallocation
*/
-unsigned int create_irq_nr(unsigned int irq_want, int node)
+unsigned int create_irq_nr(unsigned int from, int node)
{
- /* Allocate an unused irq */
- unsigned int irq;
- unsigned int new;
+ struct irq_cfg *cfg;
unsigned long flags;
- struct irq_cfg *cfg_new = NULL;
- struct irq_desc *desc_new = NULL;
+ unsigned int ret = 0;
+ int irq;
- irq = 0;
- if (irq_want < nr_irqs_gsi)
- irq_want = nr_irqs_gsi;
+ if (from < nr_irqs_gsi)
+ from = nr_irqs_gsi;
+
+ irq = alloc_irq_from(from, node);
+ if (irq < 0)
+ return 0;
+ cfg = alloc_irq_cfg(irq, node);
+ if (!cfg) {
+ free_irq_at(irq, NULL);
+ return 0;
+ }
raw_spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new < nr_irqs; new++) {
- desc_new = irq_to_desc_alloc_node(new, node);
- if (!desc_new) {
- printk(KERN_INFO "can not get irq_desc for %d\n", new);
- continue;
- }
- cfg_new = desc_new->chip_data;
-
- if (cfg_new->vector != 0)
- continue;
-
- desc_new = move_irq_desc(desc_new, node);
- cfg_new = desc_new->chip_data;
-
- if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
- irq = new;
- break;
- }
+ if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
+ ret = irq;
raw_spin_unlock_irqrestore(&vector_lock, flags);
- if (irq > 0)
- dynamic_irq_init_keep_chip_data(irq);
-
- return irq;
+ if (ret) {
+ set_irq_chip_data(irq, cfg);
+ irq_clear_status_flags(irq, IRQ_NOREQUEST);
+ } else {
+ free_irq_at(irq, cfg);
+ }
+ return ret;
}
int create_irq(void)
{
- int node = cpu_to_node(boot_cpu_id);
+ int node = cpu_to_node(0);
unsigned int irq_want;
int irq;
@@ -3301,14 +3104,17 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
+ struct irq_cfg *cfg = get_irq_chip_data(irq);
unsigned long flags;
- dynamic_irq_cleanup_keep_chip_data(irq);
+ irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
- free_irte(irq);
+ if (intr_remapping_enabled)
+ free_irte(irq);
raw_spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq, get_irq_chip_data(irq));
+ __clear_irq_vector(irq, cfg);
raw_spin_unlock_irqrestore(&vector_lock, flags);
+ free_irq_at(irq, cfg);
}
/*
@@ -3332,7 +3138,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
- if (irq_remapped(irq)) {
+ if (irq_remapped(get_irq_chip_data(irq))) {
struct irte irte;
int ir_index;
u16 sub_handle;
@@ -3340,14 +3146,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
ir_index = map_irq_to_irte_handle(irq, &sub_handle);
BUG_ON(ir_index == -1);
- memset (&irte, 0, sizeof(irte));
-
- irte.present = 1;
- irte.dst_mode = apic->irq_dest_mode;
- irte.trigger_mode = 0; /* edge */
- irte.dlvry_mode = apic->irq_delivery_mode;
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
+ prepare_irte(&irte, cfg->vector, dest);
/* Set source-id of interrupt request */
if (pdev)
@@ -3392,26 +3191,24 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
}
#ifdef CONFIG_SMP
-static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
- get_cached_msi_msg_desc(desc, &msg);
+ __get_cached_msi_msg(data->msi_desc, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- write_msi_msg_desc(desc, &msg);
+ __write_msi_msg(data->msi_desc, &msg);
return 0;
}
@@ -3421,17 +3218,17 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
* done in the process context using interrupt-remapping hardware.
*/
static int
-ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
- unsigned int dest;
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
struct irte irte;
if (get_irte(irq, &irte))
return -1;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
irte.vector = cfg->vector;
@@ -3461,27 +3258,27 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
* which implement the MSI or MSI-X Capability Structure.
*/
static struct irq_chip msi_chip = {
- .name = "PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
- .ack = ack_apic_edge,
+ .name = "PCI-MSI",
+ .irq_unmask = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = set_msi_irq_affinity,
+ .irq_set_affinity = msi_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static struct irq_chip msi_ir_chip = {
- .name = "IR-PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
+ .name = "IR-PCI-MSI",
+ .irq_unmask = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
+ .irq_ack = ir_ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = ir_set_msi_irq_affinity,
+ .irq_set_affinity = ir_msi_set_affinity,
#endif
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
/*
@@ -3513,8 +3310,8 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
- int ret;
struct msi_msg msg;
+ int ret;
ret = msi_compose_msg(dev, irq, &msg, -1);
if (ret < 0)
@@ -3523,12 +3320,8 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
- if (irq_remapped(irq)) {
- struct irq_desc *desc = irq_to_desc(irq);
- /*
- * irq migration in process context
- */
- desc->status |= IRQ_MOVE_PCNTXT;
+ if (irq_remapped(get_irq_chip_data(irq))) {
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
} else
set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
@@ -3540,13 +3333,10 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- unsigned int irq;
- int ret, sub_handle;
+ int node, ret, sub_handle, index = 0;
+ unsigned int irq, irq_want;
struct msi_desc *msidesc;
- unsigned int irq_want;
struct intel_iommu *iommu = NULL;
- int index = 0;
- int node;
/* x86 doesn't support multiple MSI yet */
if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3606,18 +3396,17 @@ void arch_teardown_msi_irq(unsigned int irq)
#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
-static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
struct msi_msg msg;
- unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
dmar_msi_read(irq, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
@@ -3633,14 +3422,14 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
#endif /* CONFIG_SMP */
static struct irq_chip dmar_msi_type = {
- .name = "DMAR_MSI",
- .unmask = dmar_msi_unmask,
- .mask = dmar_msi_mask,
- .ack = ack_apic_edge,
+ .name = "DMAR_MSI",
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = dmar_msi_set_affinity,
+ .irq_set_affinity = dmar_msi_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
int arch_setup_dmar_msi(unsigned int irq)
@@ -3661,26 +3450,24 @@ int arch_setup_dmar_msi(unsigned int irq)
#ifdef CONFIG_HPET_TIMER
#ifdef CONFIG_SMP
-static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(struct irq_data *data,
+ const struct cpumask *mask, bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
- hpet_msi_read(irq, &msg);
+ hpet_msi_read(data->handler_data, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- hpet_msi_write(irq, &msg);
+ hpet_msi_write(data->handler_data, &msg);
return 0;
}
@@ -3688,34 +3475,33 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
#endif /* CONFIG_SMP */
static struct irq_chip ir_hpet_msi_type = {
- .name = "IR-HPET_MSI",
- .unmask = hpet_msi_unmask,
- .mask = hpet_msi_mask,
+ .name = "IR-HPET_MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
+ .irq_ack = ir_ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = ir_set_msi_irq_affinity,
+ .irq_set_affinity = ir_msi_set_affinity,
#endif
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static struct irq_chip hpet_msi_type = {
.name = "HPET_MSI",
- .unmask = hpet_msi_unmask,
- .mask = hpet_msi_mask,
- .ack = ack_apic_edge,
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = hpet_msi_set_affinity,
+ .irq_set_affinity = hpet_msi_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
{
- int ret;
struct msi_msg msg;
- struct irq_desc *desc = irq_to_desc(irq);
+ int ret;
if (intr_remapping_enabled) {
struct intel_iommu *iommu = map_hpet_to_ir(id);
@@ -3733,9 +3519,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
if (ret < 0)
return ret;
- hpet_msi_write(irq, &msg);
- desc->status |= IRQ_MOVE_PCNTXT;
- if (irq_remapped(irq))
+ hpet_msi_write(get_irq_data(irq), &msg);
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+ if (irq_remapped(get_irq_chip_data(irq)))
set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
handle_edge_irq, "edge");
else
@@ -3768,33 +3554,30 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
write_ht_irq_msg(irq, &msg);
}
-static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
- target_ht_irq(irq, dest, cfg->vector);
-
+ target_ht_irq(data->irq, dest, cfg->vector);
return 0;
}
#endif
static struct irq_chip ht_irq_chip = {
- .name = "PCI-HT",
- .mask = mask_ht_irq,
- .unmask = unmask_ht_irq,
- .ack = ack_apic_edge,
+ .name = "PCI-HT",
+ .irq_mask = mask_ht_irq,
+ .irq_unmask = unmask_ht_irq,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = set_ht_irq_affinity,
+ .irq_set_affinity = ht_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
@@ -3885,14 +3668,13 @@ int __init arch_probe_nr_irqs(void)
if (nr < nr_irqs)
nr_irqs = nr;
- return 0;
+ return NR_IRQS_LEGACY;
}
#endif
static int __io_apic_set_pci_routing(struct device *dev, int irq,
struct io_apic_irq_attr *irq_attr)
{
- struct irq_desc *desc;
struct irq_cfg *cfg;
int node;
int ioapic, pin;
@@ -3908,13 +3690,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
if (dev)
node = dev_to_node(dev);
else
- node = cpu_to_node(boot_cpu_id);
+ node = cpu_to_node(0);
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc %d\n", irq);
+ cfg = alloc_irq_and_cfg_at(irq, node);
+ if (!cfg)
return 0;
- }
pin = irq_attr->ioapic_pin;
trigger = irq_attr->trigger;
@@ -3924,15 +3704,14 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
* IRQs < 16 are already in the irq_2_pin[] map
*/
if (irq >= legacy_pic->nr_legacy_irqs) {
- cfg = desc->chip_data;
- if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
+ if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) {
printk(KERN_INFO "can not add pin %d for irq %d\n",
pin, irq);
return 0;
}
}
- setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
+ setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity);
return 0;
}
@@ -4125,14 +3904,14 @@ void __init setup_ioapic_dest(void)
*/
if (desc->status &
(IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = desc->affinity;
+ mask = desc->irq_data.affinity;
else
mask = apic->target_cpus();
if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq_desc(desc, mask);
+ ir_ioapic_set_affinity(&desc->irq_data, mask, false);
else
- set_ioapic_affinity_irq_desc(desc, mask);
+ ioapic_set_affinity(&desc->irq_data, mask, false);
}
}
@@ -4316,19 +4095,18 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
void __init pre_init_apic_IRQ0(void)
{
struct irq_cfg *cfg;
- struct irq_desc *desc;
printk(KERN_INFO "Early APIC setup for system timer0\n");
#ifndef CONFIG_SMP
phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
#endif
- desc = irq_to_desc_alloc_node(0, 0);
+ /* Make sure the irq descriptor is set up */
+ cfg = alloc_irq_and_cfg_at(0, 0);
setup_local_APIC();
- cfg = irq_cfg(0);
add_pin_to_irq_node(cfg, 0, 0, 0);
set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
- setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
+ setup_ioapic_irq(0, 0, 0, cfg, 0, 0);
}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index a43f71cb30f8..c90041ccb742 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -178,7 +178,7 @@ int __init check_nmi_watchdog(void)
error:
if (nmi_watchdog == NMI_IO_APIC) {
if (!timer_through_8259)
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 83e9be4778e2..f9e4e6a54073 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -54,6 +54,9 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
*/
void __init default_setup_apic_routing(void)
{
+
+ enable_IR_x2apic();
+
#ifdef CONFIG_X86_X2APIC
if (x2apic_mode
#ifdef CONFIG_X86_UV
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ba5f62f45f01..9e093f8fe78c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
- if (c->cpu_index == boot_cpu_id)
+ if (!c->cpu_index)
return;
/*
@@ -253,37 +253,51 @@ static int __cpuinit nearby_node(int apicid)
#endif
/*
- * Fixup core topology information for AMD multi-node processors.
- * Assumption: Number of cores in each internal node is the same.
+ * Fixup core topology information for
+ * (1) AMD multi-node processors
+ * Assumption: Number of cores in each internal node is the same.
+ * (2) AMD processors supporting compute units
*/
#ifdef CONFIG_X86_HT
-static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
+static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
{
- unsigned long long value;
- u32 nodes, cores_per_node;
+ u32 nodes;
+ u8 node_id;
int cpu = smp_processor_id();
- if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
+ /* get information required for multi-node processors */
+ if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+ nodes = ((ecx >> 8) & 7) + 1;
+ node_id = ecx & 7;
+
+ /* get compute unit information */
+ smp_num_siblings = ((ebx >> 8) & 3) + 1;
+ c->compute_unit_id = ebx & 0xff;
+ } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
+ u64 value;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ nodes = ((value >> 3) & 7) + 1;
+ node_id = value & 7;
+ } else
return;
- /* fixup topology information only once for a core */
- if (cpu_has(c, X86_FEATURE_AMD_DCM))
- return;
+ /* fixup multi-node processor information */
+ if (nodes > 1) {
+ u32 cores_per_node;
- rdmsrl(MSR_FAM10H_NODE_ID, value);
+ set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+ cores_per_node = c->x86_max_cores / nodes;
- nodes = ((value >> 3) & 7) + 1;
- if (nodes == 1)
- return;
+ /* store NodeID, use llc_shared_map to store sibling info */
+ per_cpu(cpu_llc_id, cpu) = node_id;
- set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- cores_per_node = c->x86_max_cores / nodes;
-
- /* store NodeID, use llc_shared_map to store sibling info */
- per_cpu(cpu_llc_id, cpu) = value & 7;
-
- /* fixup core id to be in range from 0 to (cores_per_node - 1) */
- c->cpu_core_id = c->cpu_core_id % cores_per_node;
+ /* core id to be in range from 0 to (cores_per_node - 1) */
+ c->cpu_core_id = c->cpu_core_id % cores_per_node;
+ }
}
#endif
@@ -304,9 +318,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
- /* fixup topology information on multi-node processors */
- if ((c->x86 == 0x10) && (c->x86_model == 9))
- amd_fixup_dcm(c);
+ amd_get_topology(c);
#endif
}
@@ -412,6 +424,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
}
#endif
+
+ /* We need to do the following only once */
+ if (c != &boot_cpu_data)
+ return;
+
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+
+ if (c->x86 > 0x10 ||
+ (c->x86 == 0x10 && c->x86_model >= 0x2)) {
+ u64 val;
+
+ rdmsrl(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ printk(KERN_WARNING FW_BUG "TSC doesn't count "
+ "with P0 frequency!\n");
+ }
+ }
}
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -523,7 +552,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
#endif
if (c->extended_cpuid_level >= 0x80000006) {
- if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
+ if (cpuid_edx(0x80000006) & 0xf000)
num_cache_leaves = 4;
else
num_cache_leaves = 3;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2f9ac7da25c..4b68bda30938 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -665,7 +665,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_early_init(c);
#ifdef CONFIG_SMP
- c->cpu_index = boot_cpu_id;
+ c->cpu_index = 0;
#endif
filter_cpuid_features(c, false);
}
@@ -704,16 +704,21 @@ void __init early_cpu_init(void)
}
/*
- * The NOPL instruction is supposed to exist on all CPUs with
- * family >= 6; unfortunately, that's not true in practice because
- * of early VIA chips and (more importantly) broken virtualizers that
- * are not easy to detect. In the latter case it doesn't even *fail*
- * reliably, so probing for it doesn't even work. Disable it completely
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
* unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
*/
static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
clear_cpu_cap(c, X86_FEATURE_NOPL);
+#else
+ set_cpu_cap(c, X86_FEATURE_NOPL);
+#endif
}
static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -1264,13 +1269,6 @@ void __cpuinit cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
- /*
- * Force FPU initialization:
- */
- current_thread_info()->status = 0;
- clear_used_math();
- mxcsr_feature_mask_init();
-
fpu_init();
xsave_init();
}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f668bb1f7d43..e765633f210e 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,7 @@ struct cpu_dev {
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
extern void get_cpu_cap(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b4389441efbb..695f17731e23 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -170,7 +170,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
- if (c->cpu_index == boot_cpu_id)
+ if (!c->cpu_index)
return;
/*
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 898c2f4eab88..12cd823c8d03 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,7 +17,7 @@
#include
#include
-#include
+#include
#include
#define LVL_1_INST 1
@@ -306,7 +306,7 @@ struct _cache_attr {
ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
};
-#ifdef CONFIG_CPU_SUP_AMD
+#ifdef CONFIG_AMD_NB
/*
* L3 cache descriptors
@@ -369,7 +369,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
return;
/* not in virtualized environments */
- if (num_k8_northbridges == 0)
+ if (k8_northbridges.num == 0)
return;
/*
@@ -377,7 +377,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
* never freed but this is done only on shutdown so it doesn't matter.
*/
if (!l3_caches) {
- int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+ int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
l3_caches = kzalloc(size, GFP_ATOMIC);
if (!l3_caches)
@@ -556,12 +556,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
show_cache_disable_1, store_cache_disable_1);
-#else /* CONFIG_CPU_SUP_AMD */
+#else /* CONFIG_AMD_NB */
static void __cpuinit
amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
{
};
-#endif /* CONFIG_CPU_SUP_AMD */
+#endif /* CONFIG_AMD_NB */
static int
__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -1000,7 +1000,7 @@ static struct attribute *default_attrs[] = {
static struct attribute *default_l3_attrs[] = {
DEFAULT_SYSFS_CACHE_ATTRS,
-#ifdef CONFIG_CPU_SUP_AMD
+#ifdef CONFIG_AMD_NB
&cache_disable_0.attr,
&cache_disable_1.attr,
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 39aaee5c1ab2..80c482382d5c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -131,7 +131,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
struct thresh_restart tr;
- u8 lvt_off;
+ int lvt_off = -1;
+ u8 offset;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,8 +163,28 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (shared_bank[bank] && c->cpu_core_id)
break;
#endif
- lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR,
- APIC_EILVT_MSG_FIX, 0);
+ offset = (high & MASK_LVTOFF_HI) >> 20;
+ if (lvt_off < 0) {
+ if (setup_APIC_eilvt(offset,
+ THRESHOLD_APIC_VECTOR,
+ APIC_EILVT_MSG_FIX, 0)) {
+ pr_err(FW_BUG "cpu %d, failed to "
+ "setup threshold interrupt "
+ "for bank %d, block %d "
+ "(MSR%08X=0x%x%08x)",
+ smp_processor_id(), bank, block,
+ address, high, low);
+ continue;
+ }
+ lvt_off = offset;
+ } else if (lvt_off != offset) {
+ pr_err(FW_BUG "cpu %d, invalid threshold "
+ "interrupt offset %d for bank %d,"
+ "block %d (MSR%08X=0x%x%08x)",
+ smp_processor_id(), lvt_off, bank,
+ block, address, high, low);
+ continue;
+ }
high &= ~MASK_LVTOFF_HI;
high |= lvt_off << 20;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 169d8804a9f8..4b683267eca5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -350,7 +350,7 @@ static void intel_thermal_interrupt(void)
static void unexpected_thermal_interrupt(void)
{
- printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
+ printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
smp_processor_id());
add_taint(TAINT_MACHINE_CHECK);
}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index c5f59d071425..ac140c7be396 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return 0;
- if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+ if (boot_cpu_data.x86 < 0xf)
return 0;
/* In case some hypervisor doesn't pass SYSCFG through: */
if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d28d7d03885..9f27228ceffd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void)
}
}
+/* Get the size of contiguous MTRR range */
+static u64 get_mtrr_size(u64 mask)
+{
+ u64 size;
+
+ mask >>= PAGE_SHIFT;
+ mask |= size_or_mask;
+ size = -mask;
+ size <<= PAGE_SHIFT;
+ return size;
+}
+
/*
- * Returns the effective MTRR type for the region
- * Error returns:
- * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
- * - 0xFF - when MTRR is not enabled
+ * Check and return the effective type for MTRR-MTRR type overlap.
+ * Returns 1 if the effective type is UNCACHEABLE, else returns 0
*/
-u8 mtrr_type_lookup(u64 start, u64 end)
+static int check_type_overlap(u8 *prev, u8 *curr)
+{
+ if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
+ *prev = MTRR_TYPE_UNCACHABLE;
+ *curr = MTRR_TYPE_UNCACHABLE;
+ return 1;
+ }
+
+ if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
+ (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
+ *prev = MTRR_TYPE_WRTHROUGH;
+ *curr = MTRR_TYPE_WRTHROUGH;
+ }
+
+ if (*prev != *curr) {
+ *prev = MTRR_TYPE_UNCACHABLE;
+ *curr = MTRR_TYPE_UNCACHABLE;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Error/Semi-error returns:
+ * 0xFF - when MTRR is not enabled
+ * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
+ * corresponds only to [start:*partial_end].
+ * Caller has to lookup again for [*partial_end:end].
+ */
+static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
{
int i;
u64 base, mask;
u8 prev_match, curr_match;
+ *repeat = 0;
if (!mtrr_state_set)
return 0xFF;
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end)
start_state = ((start & mask) == (base & mask));
end_state = ((end & mask) == (base & mask));
- if (start_state != end_state)
- return 0xFE;
+
+ if (start_state != end_state) {
+ /*
+ * We have start:end spanning across an MTRR.
+ * We split the region into
+ * either
+ * (start:mtrr_end) (mtrr_end:end)
+ * or
+ * (start:mtrr_start) (mtrr_start:end)
+ * depending on kind of overlap.
+ * Return the type for first region and a pointer to
+ * the start of second region so that caller will
+ * lookup again on the second region.
+ * Note: This way we handle multiple overlaps as well.
+ */
+ if (start_state)
+ *partial_end = base + get_mtrr_size(mask);
+ else
+ *partial_end = base;
+
+ if (unlikely(*partial_end <= start)) {
+ WARN_ON(1);
+ *partial_end = start + PAGE_SIZE;
+ }
+
+ end = *partial_end - 1; /* end is inclusive */
+ *repeat = 1;
+ }
if ((start & mask) != (base & mask))
continue;
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
continue;
}
- if (prev_match == MTRR_TYPE_UNCACHABLE ||
- curr_match == MTRR_TYPE_UNCACHABLE) {
- return MTRR_TYPE_UNCACHABLE;
- }
-
- if ((prev_match == MTRR_TYPE_WRBACK &&
- curr_match == MTRR_TYPE_WRTHROUGH) ||
- (prev_match == MTRR_TYPE_WRTHROUGH &&
- curr_match == MTRR_TYPE_WRBACK)) {
- prev_match = MTRR_TYPE_WRTHROUGH;
- curr_match = MTRR_TYPE_WRTHROUGH;
- }
-
- if (prev_match != curr_match)
- return MTRR_TYPE_UNCACHABLE;
+ if (check_type_overlap(&prev_match, &curr_match))
+ return curr_match;
}
if (mtrr_tom2) {
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end)
return mtrr_state.def_type;
}
+/*
+ * Returns the effective MTRR type for the region
+ * Error return:
+ * 0xFF - when MTRR is not enabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end)
+{
+ u8 type, prev_type;
+ int repeat;
+ u64 partial_end;
+
+ type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+
+ /*
+ * Common path is with repeat = 0.
+ * However, we can have cases where [start:end] spans across some
+ * MTRR range. Do repeated lookups for that case here.
+ */
+ while (repeat) {
+ prev_type = type;
+ start = partial_end;
+ type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+
+ if (check_type_overlap(&prev_type, &type))
+ return type;
+ }
+
+ return type;
+}
+
/* Get the MSR pair relating to a var range */
static void
get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fb329e9f8494..d9f4ff8fcd69 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
- boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
- return;
- wd_ops = &k7_wd_ops;
- break;
+ if (boot_cpu_data.x86 == 6 ||
+ (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
+ wd_ops = &k7_wd_ops;
+ return;
case X86_VENDOR_INTEL:
/* Work around where perfctr1 doesn't have a working enable
* bit as described in the following errata:
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d49079515122..c7f64e6f537a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
{ X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
{ X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
+ { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
+ { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
+ { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
+ { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
+ { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
+ { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada65..994828899e09 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!csize)
return 0;
- vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+ vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (!vaddr)
return -ENOMEM;
@@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
} else
memcpy(buf, vaddr + offset, csize);
+ set_iounmap_nonlazy();
iounmap(vaddr);
return csize;
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf2686..76b8cd953dee 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -96,7 +96,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
-#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
{
@@ -115,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
d &= 0xff;
return d;
}
-#endif
static void __init ati_bugs(int num, int slot, int func)
{
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fa99bae75ace..4572f25f9325 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,6 +14,7 @@
#include
#include
#include
+#include
#include
#include
@@ -238,6 +239,18 @@ static int __init setup_early_printk(char *buf)
#ifdef CONFIG_HVC_XEN
if (!strncmp(buf, "xen", 3))
early_console_register(&xenboot_console, keep);
+#endif
+#ifdef CONFIG_X86_MRST_EARLY_PRINTK
+ if (!strncmp(buf, "mrst", 4)) {
+ mrst_early_console_init();
+ early_console_register(&early_mrst_console, keep);
+ }
+
+ if (!strncmp(buf, "hsu", 3)) {
+ hsu_early_console_init();
+ early_console_register(&early_hsu_console, keep);
+ }
+
#endif
buf++;
}
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c
new file mode 100644
index 000000000000..65df603622b2
--- /dev/null
+++ b/arch/x86/kernel/early_printk_mrst.c
@@ -0,0 +1,319 @@
+/*
+ * early_printk_mrst.c - early consoles for Intel MID platforms
+ *
+ * Copyright (c) 2008-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+/*
+ * This file implements two early consoles named mrst and hsu.
+ * mrst is based on Maxim3110 spi-uart device, it exists in both
+ * Moorestown and Medfield platforms, while hsu is based on a High
+ * Speed UART device which only exists in the Medfield platform
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#define MRST_SPI_TIMEOUT 0x200000
+#define MRST_REGBASE_SPI0 0xff128000
+#define MRST_REGBASE_SPI1 0xff128400
+#define MRST_CLK_SPI0_REG 0xff11d86c
+
+/* Bit fields in CTRLR0 */
+#define SPI_DFS_OFFSET 0
+
+#define SPI_FRF_OFFSET 4
+#define SPI_FRF_SPI 0x0
+#define SPI_FRF_SSP 0x1
+#define SPI_FRF_MICROWIRE 0x2
+#define SPI_FRF_RESV 0x3
+
+#define SPI_MODE_OFFSET 6
+#define SPI_SCPH_OFFSET 6
+#define SPI_SCOL_OFFSET 7
+#define SPI_TMOD_OFFSET 8
+#define SPI_TMOD_TR 0x0 /* xmit & recv */
+#define SPI_TMOD_TO 0x1 /* xmit only */
+#define SPI_TMOD_RO 0x2 /* recv only */
+#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */
+
+#define SPI_SLVOE_OFFSET 10
+#define SPI_SRL_OFFSET 11
+#define SPI_CFS_OFFSET 12
+
+/* Bit fields in SR, 7 bits */
+#define SR_MASK 0x7f /* cover 7 bits */
+#define SR_BUSY (1 << 0)
+#define SR_TF_NOT_FULL (1 << 1)
+#define SR_TF_EMPT (1 << 2)
+#define SR_RF_NOT_EMPT (1 << 3)
+#define SR_RF_FULL (1 << 4)
+#define SR_TX_ERR (1 << 5)
+#define SR_DCOL (1 << 6)
+
+struct dw_spi_reg {
+ u32 ctrl0;
+ u32 ctrl1;
+ u32 ssienr;
+ u32 mwcr;
+ u32 ser;
+ u32 baudr;
+ u32 txfltr;
+ u32 rxfltr;
+ u32 txflr;
+ u32 rxflr;
+ u32 sr;
+ u32 imr;
+ u32 isr;
+ u32 risr;
+ u32 txoicr;
+ u32 rxoicr;
+ u32 rxuicr;
+ u32 msticr;
+ u32 icr;
+ u32 dmacr;
+ u32 dmatdlr;
+ u32 dmardlr;
+ u32 idr;
+ u32 version;
+
+ /* Currently operates as 32 bits, though only the low 16 bits matter */
+ u32 dr;
+} __packed;
+
+#define dw_readl(dw, name) __raw_readl(&(dw)->name)
+#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name)
+
+/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
+static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
+
+static u32 *pclk_spi0;
+/* Always contains an accessable address, start with 0 */
+static struct dw_spi_reg *pspi;
+
+static struct kmsg_dumper dw_dumper;
+static int dumper_registered;
+
+static void dw_kmsg_dump(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason,
+ const char *s1, unsigned long l1,
+ const char *s2, unsigned long l2)
+{
+ int i;
+
+ /* When run to this, we'd better re-init the HW */
+ mrst_early_console_init();
+
+ for (i = 0; i < l1; i++)
+ early_mrst_console.write(&early_mrst_console, s1 + i, 1);
+ for (i = 0; i < l2; i++)
+ early_mrst_console.write(&early_mrst_console, s2 + i, 1);
+}
+
+/* Set the ratio rate to 115200, 8n1, IRQ disabled */
+static void max3110_write_config(void)
+{
+ u16 config;
+
+ config = 0xc001;
+ dw_writel(pspi, dr, config);
+}
+
+/* Translate char to a eligible word and send to max3110 */
+static void max3110_write_data(char c)
+{
+ u16 data;
+
+ data = 0x8000 | c;
+ dw_writel(pspi, dr, data);
+}
+
+void mrst_early_console_init(void)
+{
+ u32 ctrlr0 = 0;
+ u32 spi0_cdiv;
+ u32 freq; /* Freqency info only need be searched once */
+
+ /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
+ pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ MRST_CLK_SPI0_REG);
+ spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
+ freq = 100000000 / (spi0_cdiv + 1);
+
+ if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
+ mrst_spi_paddr = MRST_REGBASE_SPI1;
+
+ pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ mrst_spi_paddr);
+
+ /* Disable SPI controller */
+ dw_writel(pspi, ssienr, 0);
+
+ /* Set control param, 8 bits, transmit only mode */
+ ctrlr0 = dw_readl(pspi, ctrl0);
+
+ ctrlr0 &= 0xfcc0;
+ ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
+ | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
+ dw_writel(pspi, ctrl0, ctrlr0);
+
+ /*
+ * Change the spi0 clk to comply with 115200 bps, use 100000 to
+ * calculate the clk dividor to make the clock a little slower
+ * than real baud rate.
+ */
+ dw_writel(pspi, baudr, freq/100000);
+
+ /* Disable all INT for early phase */
+ dw_writel(pspi, imr, 0x0);
+
+ /* Set the cs to spi-uart */
+ dw_writel(pspi, ser, 0x2);
+
+ /* Enable the HW, the last step for HW init */
+ dw_writel(pspi, ssienr, 0x1);
+
+ /* Set the default configuration */
+ max3110_write_config();
+
+ /* Register the kmsg dumper */
+ if (!dumper_registered) {
+ dw_dumper.dump = dw_kmsg_dump;
+ kmsg_dump_register(&dw_dumper);
+ dumper_registered = 1;
+ }
+}
+
+/* Slave select should be called in the read/write function */
+static void early_mrst_spi_putc(char c)
+{
+ unsigned int timeout;
+ u32 sr;
+
+ timeout = MRST_SPI_TIMEOUT;
+ /* Early putc needs to make sure the TX FIFO is not full */
+ while (--timeout) {
+ sr = dw_readl(pspi, sr);
+ if (!(sr & SR_TF_NOT_FULL))
+ cpu_relax();
+ else
+ break;
+ }
+
+ if (!timeout)
+ pr_warning("MRST earlycon: timed out\n");
+ else
+ max3110_write_data(c);
+}
+
+/* Early SPI only uses polling mode */
+static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
+{
+ int i;
+
+ for (i = 0; i < n && *str; i++) {
+ if (*str == '\n')
+ early_mrst_spi_putc('\r');
+ early_mrst_spi_putc(*str);
+ str++;
+ }
+}
+
+struct console early_mrst_console = {
+ .name = "earlymrst",
+ .write = early_mrst_spi_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/*
+ * Following is the early console based on Medfield HSU (High
+ * Speed UART) device.
+ */
+#define HSU_PORT2_PADDR 0xffa28180
+
+static void __iomem *phsu;
+
+void hsu_early_console_init(void)
+{
+ u8 lcr;
+
+ phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ HSU_PORT2_PADDR);
+
+ /* Disable FIFO */
+ writeb(0x0, phsu + UART_FCR);
+
+ /* Set to default 115200 bps, 8n1 */
+ lcr = readb(phsu + UART_LCR);
+ writeb((0x80 | lcr), phsu + UART_LCR);
+ writeb(0x18, phsu + UART_DLL);
+ writeb(lcr, phsu + UART_LCR);
+ writel(0x3600, phsu + UART_MUL*4);
+
+ writeb(0x8, phsu + UART_MCR);
+ writeb(0x7, phsu + UART_FCR);
+ writeb(0x3, phsu + UART_LCR);
+
+ /* Clear IRQ status */
+ readb(phsu + UART_LSR);
+ readb(phsu + UART_RX);
+ readb(phsu + UART_IIR);
+ readb(phsu + UART_MSR);
+
+ /* Enable FIFO */
+ writeb(0x7, phsu + UART_FCR);
+}
+
+#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
+
+static void early_hsu_putc(char ch)
+{
+ unsigned int timeout = 10000; /* 10ms */
+ u8 status;
+
+ while (--timeout) {
+ status = readb(phsu + UART_LSR);
+ if (status & BOTH_EMPTY)
+ break;
+ udelay(1);
+ }
+
+ /* Only write the char when there was no timeout */
+ if (timeout)
+ writeb(ch, phsu + UART_TX);
+}
+
+static void early_hsu_write(struct console *con, const char *str, unsigned n)
+{
+ int i;
+
+ for (i = 0; i < n && *str; i++) {
+ if (*str == '\n')
+ early_hsu_putc('\r');
+ early_hsu_putc(*str);
+ str++;
+ }
+}
+
+struct console early_hsu_console = {
+ .name = "earlyhsu",
+ .write = early_hsu_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 227d00920d2f..9fb188d7bc76 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -115,8 +115,7 @@
/* unfortunately push/pop can't be no-op */
.macro PUSH_GS
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
.endm
.macro POP_GS pop=0
addl $(4 + \pop), %esp
@@ -140,14 +139,12 @@
#else /* CONFIG_X86_32_LAZY_GS */
.macro PUSH_GS
- pushl %gs
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %gs
/*CFI_REL_OFFSET gs, 0*/
.endm
.macro POP_GS pop=0
-98: popl %gs
- CFI_ADJUST_CFA_OFFSET -4
+98: popl_cfi %gs
/*CFI_RESTORE gs*/
.if \pop <> 0
add $\pop, %esp
@@ -195,35 +192,25 @@
.macro SAVE_ALL
cld
PUSH_GS
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %fs
/*CFI_REL_OFFSET fs, 0;*/
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %es
/*CFI_REL_OFFSET es, 0;*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0;*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebp
CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edi
CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %esi
CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edx
CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ecx
CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
movl $(__USER_DS), %edx
movl %edx, %ds
@@ -234,39 +221,29 @@
.endm
.macro RESTORE_INT_REGS
- popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ebx
CFI_RESTORE ebx
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ecx
CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %edx
CFI_RESTORE edx
- popl %esi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %esi
CFI_RESTORE esi
- popl %edi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %edi
CFI_RESTORE edi
- popl %ebp
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ebp
CFI_RESTORE ebp
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
CFI_RESTORE eax
.endm
.macro RESTORE_REGS pop=0
RESTORE_INT_REGS
-1: popl %ds
- CFI_ADJUST_CFA_OFFSET -4
+1: popl_cfi %ds
/*CFI_RESTORE ds;*/
-2: popl %es
- CFI_ADJUST_CFA_OFFSET -4
+2: popl_cfi %es
/*CFI_RESTORE es;*/
-3: popl %fs
- CFI_ADJUST_CFA_OFFSET -4
+3: popl_cfi %fs
/*CFI_RESTORE fs;*/
POP_GS \pop
.pushsection .fixup, "ax"
@@ -320,16 +297,12 @@
ENTRY(ret_from_fork)
CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
call schedule_tail
GET_THREAD_INFO(%ebp)
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- pushl $0x0202 # Reset kernel eflags
- CFI_ADJUST_CFA_OFFSET 4
- popfl
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
+ pushl_cfi $0x0202 # Reset kernel eflags
+ popfl_cfi
jmp syscall_exit
CFI_ENDPROC
END(ret_from_fork)
@@ -409,29 +382,23 @@ sysenter_past_esp:
* enough kernel state to call TRACE_IRQS_OFF can be called - but
* we immediately enable interrupts at that point anyway.
*/
- pushl $(__USER_DS)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $(__USER_DS)
/*CFI_REL_OFFSET ss, 0*/
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebp
CFI_REL_OFFSET esp, 0
- pushfl
+ pushfl_cfi
orl $X86_EFLAGS_IF, (%esp)
- CFI_ADJUST_CFA_OFFSET 4
- pushl $(__USER_CS)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $(__USER_CS)
/*CFI_REL_OFFSET cs, 0*/
/*
* Push current_thread_info()->sysenter_return to the stack.
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
* pushed above; +8 corresponds to copy_thread's esp0 setting.
*/
- pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
CFI_REL_OFFSET eip, 0
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
SAVE_ALL
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -486,8 +453,7 @@ sysenter_audit:
movl %eax,%edx /* 2nd arg: syscall number */
movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
call audit_syscall_entry
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
movl PT_EAX(%esp),%eax /* reload syscall number */
jmp sysenter_do_call
@@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target)
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
- pushl %eax # save orig_eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax # save orig_eax
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation / emulation
@@ -566,7 +531,6 @@ restore_all_notrace:
je ldt_ss # returning to user-space with LDT SS
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
- CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
.section .fixup,"ax"
@@ -619,10 +583,8 @@ ldt_ss:
shr $16, %edx
mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
- pushl $__ESPFIX_SS
- CFI_ADJUST_CFA_OFFSET 4
- push %eax /* new kernel esp */
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $__ESPFIX_SS
+ pushl_cfi %eax /* new kernel esp */
/* Disable interrupts, but do not irqtrace this section: we
* will soon execute iret and the tracer was already set to
* the irqstate after the iret */
@@ -666,11 +628,9 @@ work_notifysig: # deal with pending signals and
ALIGN
work_notifysig_v86:
- pushl %ecx # save ti_flags for do_notify_resume
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ecx # save ti_flags for do_notify_resume
call save_v86_state # %eax contains pt_regs pointer
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ecx
movl %eax, %esp
#else
movl %esp, %eax
@@ -750,14 +710,18 @@ ptregs_##name: \
#define PTREGSCALL3(name) \
ALIGN; \
ptregs_##name: \
+ CFI_STARTPROC; \
leal 4(%esp),%eax; \
- pushl %eax; \
+ pushl_cfi %eax; \
movl PT_EDX(%eax),%ecx; \
movl PT_ECX(%eax),%edx; \
movl PT_EBX(%eax),%eax; \
call sys_##name; \
addl $4,%esp; \
- ret
+ CFI_ADJUST_CFA_OFFSET -4; \
+ ret; \
+ CFI_ENDPROC; \
+ENDPROC(ptregs_##name)
PTREGSCALL1(iopl)
PTREGSCALL0(fork)
@@ -772,15 +736,19 @@ PTREGSCALL1(vm86old)
/* Clone is an oddball. The 4th arg is in %edi */
ALIGN;
ptregs_clone:
+ CFI_STARTPROC
leal 4(%esp),%eax
- pushl %eax
- pushl PT_EDI(%eax)
+ pushl_cfi %eax
+ pushl_cfi PT_EDI(%eax)
movl PT_EDX(%eax),%ecx
movl PT_ECX(%eax),%edx
movl PT_EBX(%eax),%eax
call sys_clone
addl $8,%esp
+ CFI_ADJUST_CFA_OFFSET -8
ret
+ CFI_ENDPROC
+ENDPROC(ptregs_clone)
.macro FIXUP_ESPFIX_STACK
/*
@@ -795,10 +763,8 @@ ptregs_clone:
mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
shl $16, %eax
addl %esp, %eax /* the adjusted stack pointer */
- pushl $__KERNEL_DS
- CFI_ADJUST_CFA_OFFSET 4
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $__KERNEL_DS
+ pushl_cfi %eax
lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
.endm
@@ -835,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR
.if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -4
.endif
-1: pushl $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 4
+1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
.if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
jmp 2f
.endif
@@ -876,8 +841,7 @@ ENDPROC(common_interrupt)
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
- pushl $~(nr); \
- CFI_ADJUST_CFA_OFFSET 4; \
+ pushl_cfi $~(nr); \
SAVE_ALL; \
TRACE_IRQS_OFF \
movl %esp,%eax; \
@@ -893,21 +857,18 @@ ENDPROC(name)
ENTRY(coprocessor_error)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_coprocessor_error
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_coprocessor_error
jmp error_code
CFI_ENDPROC
END(coprocessor_error)
ENTRY(simd_coprocessor_error)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661: pushl $do_general_protection
+661: pushl_cfi $do_general_protection
662:
.section .altinstructions,"a"
.balign 4
@@ -922,19 +883,16 @@ ENTRY(simd_coprocessor_error)
664:
.previous
#else
- pushl $do_simd_coprocessor_error
+ pushl_cfi $do_simd_coprocessor_error
#endif
- CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(simd_coprocessor_error)
ENTRY(device_not_available)
RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_device_not_available
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $-1 # mark this as an int
+ pushl_cfi $do_device_not_available
jmp error_code
CFI_ENDPROC
END(device_not_available)
@@ -956,82 +914,68 @@ END(native_irq_enable_sysexit)
ENTRY(overflow)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_overflow
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_overflow
jmp error_code
CFI_ENDPROC
END(overflow)
ENTRY(bounds)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_bounds
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_bounds
jmp error_code
CFI_ENDPROC
END(bounds)
ENTRY(invalid_op)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_invalid_op
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_invalid_op
jmp error_code
CFI_ENDPROC
END(invalid_op)
ENTRY(coprocessor_segment_overrun)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_coprocessor_segment_overrun
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_coprocessor_segment_overrun
jmp error_code
CFI_ENDPROC
END(coprocessor_segment_overrun)
ENTRY(invalid_TSS)
RING0_EC_FRAME
- pushl $do_invalid_TSS
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_invalid_TSS
jmp error_code
CFI_ENDPROC
END(invalid_TSS)
ENTRY(segment_not_present)
RING0_EC_FRAME
- pushl $do_segment_not_present
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_segment_not_present
jmp error_code
CFI_ENDPROC
END(segment_not_present)
ENTRY(stack_segment)
RING0_EC_FRAME
- pushl $do_stack_segment
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_stack_segment
jmp error_code
CFI_ENDPROC
END(stack_segment)
ENTRY(alignment_check)
RING0_EC_FRAME
- pushl $do_alignment_check
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_alignment_check
jmp error_code
CFI_ENDPROC
END(alignment_check)
ENTRY(divide_error)
RING0_INT_FRAME
- pushl $0 # no error code
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_divide_error
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0 # no error code
+ pushl_cfi $do_divide_error
jmp error_code
CFI_ENDPROC
END(divide_error)
@@ -1039,10 +983,8 @@ END(divide_error)
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl machine_check_vector
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi machine_check_vector
jmp error_code
CFI_ENDPROC
END(machine_check)
@@ -1050,10 +992,8 @@ END(machine_check)
ENTRY(spurious_interrupt_bug)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_spurious_interrupt_bug
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_spurious_interrupt_bug
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
@@ -1084,8 +1024,7 @@ ENTRY(xen_sysenter_target)
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
SAVE_ALL
TRACE_IRQS_OFF
@@ -1121,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback)
# We distinguish between categories by maintaining a status value in EAX.
ENTRY(xen_failsafe_callback)
CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
movl $1,%eax
1: mov 4(%esp),%ds
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
testl %eax,%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
addl $16,%esp
jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
-5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
- CFI_ADJUST_CFA_OFFSET 4
+5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
@@ -1287,40 +1223,29 @@ syscall_table_size=(.-sys_call_table)
ENTRY(page_fault)
RING0_EC_FRAME
- pushl $do_page_fault
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_page_fault
ALIGN
error_code:
/* the function address is in %gs's slot on the stack */
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %fs
/*CFI_REL_OFFSET fs, 0*/
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %es
/*CFI_REL_OFFSET es, 0*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebp
CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edi
CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %esi
CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edx
CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ecx
CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
cld
movl $(__KERNEL_PERCPU), %ecx
@@ -1362,12 +1287,9 @@ END(page_fault)
movl TSS_sysenter_sp0 + \offset(%esp), %esp
CFI_DEF_CFA esp, 0
CFI_UNDEFINED eip
- pushfl
- CFI_ADJUST_CFA_OFFSET 4
- pushl $__KERNEL_CS
- CFI_ADJUST_CFA_OFFSET 4
- pushl $sysenter_past_esp
- CFI_ADJUST_CFA_OFFSET 4
+ pushfl_cfi
+ pushl_cfi $__KERNEL_CS
+ pushl_cfi $sysenter_past_esp
CFI_REL_OFFSET eip, 0
.endm
@@ -1377,8 +1299,7 @@ ENTRY(debug)
jne debug_stack_correct
FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
debug_stack_correct:
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
xorl %edx,%edx # error code 0
@@ -1398,32 +1319,27 @@ END(debug)
*/
ENTRY(nmi)
RING0_INT_FRAME
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
je nmi_espfix_stack
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
movl %esp,%eax
/* Do not access memory above the end of our stack page,
* it might not exist.
*/
andl $(THREAD_SIZE-1),%eax
cmpl $(THREAD_SIZE-20),%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
jae nmi_stack_correct
cmpl $ia32_sysenter_target,12(%esp)
je nmi_debug_stack_check
nmi_stack_correct:
/* We have a RING0_INT_FRAME here */
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
SAVE_ALL
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
@@ -1452,18 +1368,14 @@ nmi_espfix_stack:
*
* create the pointer to lss back
*/
- pushl %ss
- CFI_ADJUST_CFA_OFFSET 4
- pushl %esp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ss
+ pushl_cfi %esp
addl $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
- pushl 16(%esp)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi 16(%esp)
.endr
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
SAVE_ALL
FIXUP_ESPFIX_STACK # %eax == %esp
xorl %edx,%edx # zero error code
@@ -1477,8 +1389,7 @@ END(nmi)
ENTRY(int3)
RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
xorl %edx,%edx # zero error code
@@ -1490,8 +1401,7 @@ END(int3)
ENTRY(general_protection)
RING0_EC_FRAME
- pushl $do_general_protection
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_general_protection
jmp error_code
CFI_ENDPROC
END(general_protection)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c375c79065f8..a7ae7fd1010f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64)
.macro FAKE_STACK_FRAME child_rip
/* push in order ss, rsp, eflags, cs, rip */
xorl %eax, %eax
- pushq $__KERNEL_DS /* ss */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $__KERNEL_DS /* ss */
/*CFI_REL_OFFSET ss,0*/
- pushq %rax /* rsp */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
- pushq $X86_EFLAGS_IF /* eflags - interrupts on */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
- pushq $__KERNEL_CS /* cs */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
- pushq \child_rip /* rip */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi \child_rip /* rip */
CFI_REL_OFFSET rip,0
- pushq %rax /* orig rax */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rax /* orig rax */
.endm
.macro UNFAKE_STACK_FRAME
@@ -398,10 +392,8 @@ ENTRY(ret_from_fork)
LOCK ; btr $TIF_FORK,TI_flags(%r8)
- push kernel_eflags(%rip)
- CFI_ADJUST_CFA_OFFSET 8
- popf # reset kernel eflags
- CFI_ADJUST_CFA_OFFSET -8
+ pushq_cfi kernel_eflags(%rip)
+ popfq_cfi # reset kernel eflags
call schedule_tail # rdi: 'prev' task parameter
@@ -521,11 +513,9 @@ sysret_careful:
jnc sysret_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
jmp sysret_check
/* Handle a signal */
@@ -634,11 +624,9 @@ int_careful:
jnc int_very_careful
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
@@ -652,12 +640,10 @@ int_check_syscall_exit_work:
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
leaq 8(%rsp),%rdi # &ptregs -> arg1
call syscall_trace_leave
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
@@ -714,9 +700,8 @@ END(ptregscall_common)
ENTRY(stub_execve)
CFI_STARTPROC
- popq %r11
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rip, r11
+ addq $8, %rsp
+ PARTIAL_FRAME 0
SAVE_REST
FIXUP_TOP_OF_STACK %r11
movq %rsp, %rcx
@@ -735,7 +720,7 @@ END(stub_execve)
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
+ PARTIAL_FRAME 0
SAVE_REST
movq %rsp,%rdi
FIXUP_TOP_OF_STACK %r11
@@ -766,8 +751,7 @@ vector=FIRST_EXTERNAL_VECTOR
.if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -8
.endif
-1: pushq $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 8
+1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
.if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
jmp 2f
.endif
@@ -796,8 +780,8 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- subq $10*8, %rsp
- CFI_ADJUST_CFA_OFFSET 10*8
+ subq $ORIG_RAX-ARGOFFSET+8, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
call save_args
PARTIAL_FRAME 0
call \func
@@ -822,6 +806,7 @@ ret_from_intr:
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
leaveq
+ CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
exit_intr:
@@ -903,11 +888,9 @@ retint_careful:
jnc retint_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -956,8 +939,7 @@ END(common_interrupt)
.macro apicinterrupt num sym do_sym
ENTRY(\sym)
INTR_FRAME
- pushq $~(\num)
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $~(\num)
interrupt \do_sym
jmp ret_from_intr
CFI_ENDPROC
@@ -1036,8 +1018,8 @@ ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call error_entry
DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
@@ -1052,9 +1034,9 @@ END(\sym)
ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
@@ -1070,9 +1052,9 @@ END(\sym)
ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
@@ -1089,8 +1071,8 @@ END(\sym)
ENTRY(\sym)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call error_entry
DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
@@ -1107,8 +1089,8 @@ END(\sym)
ENTRY(\sym)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
DEFAULT_FRAME 0
TRACE_IRQS_OFF
@@ -1139,16 +1121,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error
/* edi: new selector */
ENTRY(native_load_gs_index)
CFI_STARTPROC
- pushf
- CFI_ADJUST_CFA_OFFSET 8
+ pushfq_cfi
DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
SWAPGS
gs_change:
movl %edi,%gs
2: mfence /* workaround */
SWAPGS
- popf
- CFI_ADJUST_CFA_OFFSET -8
+ popfq_cfi
ret
CFI_ENDPROC
END(native_load_gs_index)
@@ -1215,8 +1195,7 @@ END(kernel_execve)
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
CFI_STARTPROC
- push %rbp
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rbp
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
@@ -1225,6 +1204,7 @@ ENTRY(call_softirq)
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
+ CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
decl PER_CPU_VAR(irq_count)
@@ -1368,7 +1348,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
/* ebx: no swapgs flag */
ENTRY(paranoid_exit)
- INTR_FRAME
+ DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl %ebx,%ebx /* swapgs needed? */
@@ -1445,7 +1425,6 @@ error_swapgs:
error_sti:
TRACE_IRQS_OFF
ret
- CFI_ENDPROC
/*
* There are two places in the kernel that can potentially fault with
@@ -1470,6 +1449,7 @@ bstep_iret:
/* Fix truncated RIP */
movq %rcx,RIP+8(%rsp)
jmp error_swapgs
+ CFI_ENDPROC
END(error_entry)
@@ -1498,8 +1478,8 @@ ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1
- subq $15*8, %rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
DEFAULT_FRAME 0
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 7494999141b3..efaf906daf93 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -440,9 +440,9 @@ static int hpet_legacy_next_event(unsigned long delta,
static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
static struct hpet_dev *hpet_devs;
-void hpet_msi_unmask(unsigned int irq)
+void hpet_msi_unmask(struct irq_data *data)
{
- struct hpet_dev *hdev = get_irq_data(irq);
+ struct hpet_dev *hdev = data->handler_data;
unsigned int cfg;
/* unmask it */
@@ -451,10 +451,10 @@ void hpet_msi_unmask(unsigned int irq)
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
-void hpet_msi_mask(unsigned int irq)
+void hpet_msi_mask(struct irq_data *data)
{
+ struct hpet_dev *hdev = data->handler_data;
unsigned int cfg;
- struct hpet_dev *hdev = get_irq_data(irq);
/* mask it */
cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -462,18 +462,14 @@ void hpet_msi_mask(unsigned int irq)
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
-void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
+void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
{
- struct hpet_dev *hdev = get_irq_data(irq);
-
hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
}
-void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
+void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
{
- struct hpet_dev *hdev = get_irq_data(irq);
-
msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
msg->address_hi = 0;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a46cb3522c0c..58bb239a2fd7 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void)
*/
if (!HAVE_HWFP) {
+ /*
+ * Disable xsave as we do not support it if i387
+ * emulation is enabled.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
xstate_size = sizeof(struct i387_soft_struct);
return;
}
if (cpu_has_fxsr)
xstate_size = sizeof(struct i387_fxsave_struct);
-#ifdef CONFIG_X86_32
else
xstate_size = sizeof(struct i387_fsave_struct);
-#endif
}
-#ifdef CONFIG_X86_64
/*
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
@@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void)
void __cpuinit fpu_init(void)
{
- unsigned long oldcr0 = read_cr0();
+ unsigned long cr0;
+ unsigned long cr4_mask = 0;
- set_in_cr4(X86_CR4_OSFXSR);
- set_in_cr4(X86_CR4_OSXMMEXCPT);
+ if (cpu_has_fxsr)
+ cr4_mask |= X86_CR4_OSFXSR;
+ if (cpu_has_xmm)
+ cr4_mask |= X86_CR4_OSXMMEXCPT;
+ if (cr4_mask)
+ set_in_cr4(cr4_mask);
- write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+ if (!HAVE_HWFP)
+ cr0 |= X86_CR0_EM;
+ write_cr0(cr0);
if (!smp_processor_id())
init_thread_xstate();
@@ -104,24 +116,12 @@ void __cpuinit fpu_init(void)
clear_used_math();
}
-#else /* CONFIG_X86_64 */
-
-void __cpuinit fpu_init(void)
-{
- if (!smp_processor_id())
- init_thread_xstate();
-}
-
-#endif /* CONFIG_X86_32 */
-
void fpu_finit(struct fpu *fpu)
{
-#ifdef CONFIG_X86_32
if (!HAVE_HWFP) {
finit_soft_fpu(&fpu->state->soft);
return;
}
-#endif
if (cpu_has_fxsr) {
struct i387_fxsave_struct *fx = &fpu->state->fxsave;
@@ -386,19 +386,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
#ifdef CONFIG_X86_64
env->fip = fxsave->rip;
env->foo = fxsave->rdp;
+ /*
+ * should be actually ds/cs at fpu exception time, but
+ * that information is not available in 64bit mode.
+ */
+ env->fcs = task_pt_regs(tsk)->cs;
if (tsk == current) {
- /*
- * should be actually ds/cs at fpu exception time, but
- * that information is not available in 64bit mode.
- */
- asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
- asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
+ savesegment(ds, env->fos);
} else {
- struct pt_regs *regs = task_pt_regs(tsk);
-
- env->fos = 0xffff0000 | tsk->thread.ds;
- env->fcs = regs->cs;
+ env->fos = tsk->thread.ds;
}
+ env->fos |= 0xffff0000;
#else
env->fip = fxsave->fip;
env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index cafa7c80ac95..20757cb2efa3 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -29,24 +29,10 @@
* plus some generic x86 specific things if generic specifics makes
* any sense at all.
*/
+static void init_8259A(int auto_eoi);
static int i8259A_auto_eoi;
DEFINE_RAW_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-static void mask_8259A(void);
-static void unmask_8259A(void);
-static void disable_8259A_irq(unsigned int irq);
-static void enable_8259A_irq(unsigned int irq);
-static void init_8259A(int auto_eoi);
-static int i8259A_irq_pending(unsigned int irq);
-
-struct irq_chip i8259A_chip = {
- .name = "XT-PIC",
- .mask = disable_8259A_irq,
- .disable = disable_8259A_irq,
- .unmask = enable_8259A_irq,
- .mask_ack = mask_and_ack_8259A,
-};
/*
* 8259A PIC functions to handle ISA devices:
@@ -68,7 +54,7 @@ unsigned int cached_irq_mask = 0xffff;
*/
unsigned long io_apic_irqs;
-static void disable_8259A_irq(unsigned int irq)
+static void mask_8259A_irq(unsigned int irq)
{
unsigned int mask = 1 << irq;
unsigned long flags;
@@ -82,7 +68,12 @@ static void disable_8259A_irq(unsigned int irq)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-static void enable_8259A_irq(unsigned int irq)
+static void disable_8259A_irq(struct irq_data *data)
+{
+ mask_8259A_irq(data->irq);
+}
+
+static void unmask_8259A_irq(unsigned int irq)
{
unsigned int mask = ~(1 << irq);
unsigned long flags;
@@ -96,6 +87,11 @@ static void enable_8259A_irq(unsigned int irq)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
+static void enable_8259A_irq(struct irq_data *data)
+{
+ unmask_8259A_irq(data->irq);
+}
+
static int i8259A_irq_pending(unsigned int irq)
{
unsigned int mask = 1<irq;
unsigned int irqmask = 1 << irq;
unsigned long flags;
@@ -223,6 +220,14 @@ spurious_8259A_irq:
}
}
+struct irq_chip i8259A_chip = {
+ .name = "XT-PIC",
+ .irq_mask = disable_8259A_irq,
+ .irq_disable = disable_8259A_irq,
+ .irq_unmask = enable_8259A_irq,
+ .irq_mask_ack = mask_and_ack_8259A,
+};
+
static char irq_trigger[2];
/**
* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -342,9 +347,9 @@ static void init_8259A(int auto_eoi)
* In AEOI mode we just have to mask the interrupt
* when acking.
*/
- i8259A_chip.mask_ack = disable_8259A_irq;
+ i8259A_chip.irq_mask_ack = disable_8259A_irq;
else
- i8259A_chip.mask_ack = mask_and_ack_8259A;
+ i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
udelay(100); /* wait for 8259A to initialize */
@@ -363,14 +368,6 @@ static void init_8259A(int auto_eoi)
static void legacy_pic_noop(void) { };
static void legacy_pic_uint_noop(unsigned int unused) { };
static void legacy_pic_int_noop(int unused) { };
-
-static struct irq_chip dummy_pic_chip = {
- .name = "dummy pic",
- .mask = legacy_pic_uint_noop,
- .unmask = legacy_pic_uint_noop,
- .disable = legacy_pic_uint_noop,
- .mask_ack = legacy_pic_uint_noop,
-};
static int legacy_pic_irq_pending_noop(unsigned int irq)
{
return 0;
@@ -378,7 +375,9 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
struct legacy_pic null_legacy_pic = {
.nr_legacy_irqs = 0,
- .chip = &dummy_pic_chip,
+ .chip = &dummy_irq_chip,
+ .mask = legacy_pic_uint_noop,
+ .unmask = legacy_pic_uint_noop,
.mask_all = legacy_pic_noop,
.restore_mask = legacy_pic_noop,
.init = legacy_pic_int_noop,
@@ -389,7 +388,9 @@ struct legacy_pic null_legacy_pic = {
struct legacy_pic default_legacy_pic = {
.nr_legacy_irqs = NR_IRQS_LEGACY,
.chip = &i8259A_chip,
- .mask_all = mask_8259A,
+ .mask = mask_8259A_irq,
+ .unmask = unmask_8259A_irq,
+ .mask_all = mask_8259A,
.restore_mask = unmask_8259A,
.init = init_8259A,
.irq_pending = i8259A_irq_pending,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 44edb03fc9ec..83ec0175f986 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -159,7 +159,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, "%*d: ", prec, i);
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
- seq_printf(p, " %8s", desc->chip->name);
+ seq_printf(p, " %8s", desc->irq_data.chip->name);
seq_printf(p, "-%-8s", desc->name);
if (action) {
@@ -282,6 +282,7 @@ void fixup_irqs(void)
unsigned int irq, vector;
static int warned;
struct irq_desc *desc;
+ struct irq_data *data;
for_each_irq_desc(irq, desc) {
int break_affinity = 0;
@@ -296,7 +297,8 @@ void fixup_irqs(void)
/* interrupt's are disabled at this point */
raw_spin_lock(&desc->lock);
- affinity = desc->affinity;
+ data = &desc->irq_data;
+ affinity = data->affinity;
if (!irq_has_action(irq) ||
cpumask_equal(affinity, cpu_online_mask)) {
raw_spin_unlock(&desc->lock);
@@ -315,16 +317,16 @@ void fixup_irqs(void)
affinity = cpu_all_mask;
}
- if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
- desc->chip->mask(irq);
+ if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask)
+ data->chip->irq_mask(data);
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, affinity);
+ if (data->chip->irq_set_affinity)
+ data->chip->irq_set_affinity(data, affinity, true);
else if (!(warned++))
set_affinity = 0;
- if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
- desc->chip->unmask(irq);
+ if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask)
+ data->chip->irq_unmask(data);
raw_spin_unlock(&desc->lock);
@@ -355,10 +357,10 @@ void fixup_irqs(void)
if (irr & (1 << (vector % 32))) {
irq = __get_cpu_var(vector_irq)[vector];
- desc = irq_to_desc(irq);
+ data = irq_get_irq_data(irq);
raw_spin_lock(&desc->lock);
- if (desc->chip->retrigger)
- desc->chip->retrigger(irq);
+ if (data->chip->irq_retrigger)
+ data->chip->irq_retrigger(data);
raw_spin_unlock(&desc->lock);
}
}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 713969b9266b..c752e973958d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -100,6 +100,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
void __init init_ISA_irqs(void)
{
+ struct irq_chip *chip = legacy_pic->chip;
+ const char *name = chip->name;
int i;
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
@@ -107,19 +109,8 @@ void __init init_ISA_irqs(void)
#endif
legacy_pic->init(0);
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
-
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
+ for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+ set_irq_chip_and_handler_name(i, chip, handle_level_irq, name);
}
void __init init_IRQ(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 035c8c529181..b3ea9db39db6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
if (!page)
goto out;
pud = (pud_t *)page_address(page);
- memset(pud, 0, PAGE_SIZE);
+ clear_page(pud);
set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
}
pud = pud_offset(pgd, addr);
@@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
if (!page)
goto out;
pmd = (pmd_t *)page_address(page);
- memset(pmd, 0, PAGE_SIZE);
+ clear_page(pmd);
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
}
pmd = pmd_offset(pud, addr);
diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/kernel/olpc-xo1.c
new file mode 100644
index 000000000000..f5442c03abc3
--- /dev/null
+++ b/arch/x86/kernel/olpc-xo1.c
@@ -0,0 +1,140 @@
+/*
+ * Support for features of the OLPC XO-1 laptop
+ *
+ * Copyright (C) 2010 One Laptop per Child
+ * Copyright (C) 2006 Red Hat, Inc.
+ * Copyright (C) 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#define DRV_NAME "olpc-xo1"
+
+#define PMS_BAR 4
+#define ACPI_BAR 5
+
+/* PMC registers (PMS block) */
+#define PM_SCLK 0x10
+#define PM_IN_SLPCTL 0x20
+#define PM_WKXD 0x34
+#define PM_WKD 0x30
+#define PM_SSC 0x54
+
+/* PM registers (ACPI block) */
+#define PM1_CNT 0x08
+#define PM_GPE0_STS 0x18
+
+static unsigned long acpi_base;
+static unsigned long pms_base;
+
+static void xo1_power_off(void)
+{
+ printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
+
+ /* Enable all of these controls with 0 delay */
+ outl(0x40000000, pms_base + PM_SCLK);
+ outl(0x40000000, pms_base + PM_IN_SLPCTL);
+ outl(0x40000000, pms_base + PM_WKXD);
+ outl(0x40000000, pms_base + PM_WKD);
+
+ /* Clear status bits (possibly unnecessary) */
+ outl(0x0002ffff, pms_base + PM_SSC);
+ outl(0xffffffff, acpi_base + PM_GPE0_STS);
+
+ /* Write SLP_EN bit to start the machinery */
+ outl(0x00002000, acpi_base + PM1_CNT);
+}
+
+/* Read the base addresses from the PCI BAR info */
+static int __devinit setup_bases(struct pci_dev *pdev)
+{
+ int r;
+
+ r = pci_enable_device_io(pdev);
+ if (r) {
+ dev_err(&pdev->dev, "can't enable device IO\n");
+ return r;
+ }
+
+ r = pci_request_region(pdev, ACPI_BAR, DRV_NAME);
+ if (r) {
+ dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR);
+ return r;
+ }
+
+ r = pci_request_region(pdev, PMS_BAR, DRV_NAME);
+ if (r) {
+ dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR);
+ pci_release_region(pdev, ACPI_BAR);
+ return r;
+ }
+
+ acpi_base = pci_resource_start(pdev, ACPI_BAR);
+ pms_base = pci_resource_start(pdev, PMS_BAR);
+
+ return 0;
+}
+
+static int __devinit olpc_xo1_probe(struct platform_device *pdev)
+{
+ struct pci_dev *pcidev;
+ int r;
+
+ pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
+ NULL);
+ if (!pdev)
+ return -ENODEV;
+
+ r = setup_bases(pcidev);
+ if (r)
+ return r;
+
+ pm_power_off = xo1_power_off;
+
+ printk(KERN_INFO "OLPC XO-1 support registered\n");
+ return 0;
+}
+
+static int __devexit olpc_xo1_remove(struct platform_device *pdev)
+{
+ pm_power_off = NULL;
+ return 0;
+}
+
+static struct platform_driver olpc_xo1_driver = {
+ .driver = {
+ .name = DRV_NAME,
+ .owner = THIS_MODULE,
+ },
+ .probe = olpc_xo1_probe,
+ .remove = __devexit_p(olpc_xo1_remove),
+};
+
+static int __init olpc_xo1_init(void)
+{
+ return platform_driver_register(&olpc_xo1_driver);
+}
+
+static void __exit olpc_xo1_exit(void)
+{
+ platform_driver_unregister(&olpc_xo1_driver);
+}
+
+MODULE_AUTHOR("Daniel Drake ");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:olpc-xo1");
+
+module_init(olpc_xo1_init);
+module_exit(olpc_xo1_exit);
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 0e0cdde519be..edaf3fe8dc5e 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,6 +17,7 @@
#include
#include
#include
+#include
#include
#include
@@ -114,6 +115,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
unsigned long flags;
int ret = -EIO;
int i;
+ int restarts = 0;
spin_lock_irqsave(&ec_lock, flags);
@@ -169,7 +171,9 @@ restart:
if (wait_on_obf(0x6c, 1)) {
printk(KERN_ERR "olpc-ec: timeout waiting for"
" EC to provide data!\n");
- goto restart;
+ if (restarts++ < 10)
+ goto restart;
+ goto err;
}
outbuf[i] = inb(0x68);
pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
@@ -183,8 +187,21 @@ err:
}
EXPORT_SYMBOL_GPL(olpc_ec_cmd);
-#ifdef CONFIG_OLPC_OPENFIRMWARE
-static void __init platform_detect(void)
+static bool __init check_ofw_architecture(void)
+{
+ size_t propsize;
+ char olpc_arch[5];
+ const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
+ void *res[] = { &propsize };
+
+ if (olpc_ofw("getprop", args, res)) {
+ printk(KERN_ERR "ofw: getprop call failed!\n");
+ return false;
+ }
+ return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
+}
+
+static u32 __init get_board_revision(void)
{
size_t propsize;
__be32 rev;
@@ -193,46 +210,44 @@ static void __init platform_detect(void)
if (olpc_ofw("getprop", args, res) || propsize != 4) {
printk(KERN_ERR "ofw: getprop call failed!\n");
- rev = cpu_to_be32(0);
+ return cpu_to_be32(0);
}
- olpc_platform_info.boardrev = be32_to_cpu(rev);
+ return be32_to_cpu(rev);
}
-#else
-static void __init platform_detect(void)
+
+static bool __init platform_detect(void)
{
- /* stopgap until OFW support is added to the kernel */
- olpc_platform_info.boardrev = olpc_board(0xc2);
+ if (!check_ofw_architecture())
+ return false;
+ olpc_platform_info.flags |= OLPC_F_PRESENT;
+ olpc_platform_info.boardrev = get_board_revision();
+ return true;
+}
+
+static int __init add_xo1_platform_devices(void)
+{
+ struct platform_device *pdev;
+
+ pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
+
+ pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
+
+ return 0;
}
-#endif
static int __init olpc_init(void)
{
- unsigned char *romsig;
+ int r = 0;
- /* The ioremap check is dangerous; limit what we run it on */
- if (!is_geode() || cs5535_has_vsa2())
+ if (!olpc_ofw_present() || !platform_detect())
return 0;
spin_lock_init(&ec_lock);
- romsig = ioremap(0xffffffc0, 16);
- if (!romsig)
- return 0;
-
- if (strncmp(romsig, "CL1 Q", 7))
- goto unmap;
- if (strncmp(romsig+6, romsig+13, 3)) {
- printk(KERN_INFO "OLPC BIOS signature looks invalid. "
- "Assuming not OLPC\n");
- goto unmap;
- }
-
- printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig);
- olpc_platform_info.flags |= OLPC_F_PRESENT;
-
- /* get the platform revision */
- platform_detect();
-
/* assume B1 and above models always have a DCON */
if (olpc_board_at_least(olpc_board(0xb1)))
olpc_platform_info.flags |= OLPC_F_DCON;
@@ -242,8 +257,10 @@ static int __init olpc_init(void)
(unsigned char *) &olpc_platform_info.ecver, 1);
#ifdef CONFIG_PCI_OLPC
- /* If the VSA exists let it emulate PCI, if not emulate in kernel */
- if (!cs5535_has_vsa2())
+ /* If the VSA exists let it emulate PCI, if not emulate in kernel.
+ * XO-1 only. */
+ if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) &&
+ !cs5535_has_vsa2())
x86_init.pci.arch_init = pci_olpc_init;
#endif
@@ -252,8 +269,12 @@ static int __init olpc_init(void)
olpc_platform_info.boardrev >> 4,
olpc_platform_info.ecver);
-unmap:
- iounmap(romsig);
+ if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */
+ r = add_xo1_platform_devices();
+ if (r)
+ return r;
+ }
+
return 0;
}
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
index 3218aa71ab5e..787320464379 100644
--- a/arch/x86/kernel/olpc_ofw.c
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -74,6 +74,12 @@ int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
}
EXPORT_SYMBOL_GPL(__olpc_ofw);
+bool olpc_ofw_present(void)
+{
+ return olpc_ofw_cif != NULL;
+}
+EXPORT_SYMBOL_GPL(olpc_ofw_present);
+
/* OFW cif _should_ be above this address */
#define OFW_MIN 0xff000000
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c01..c5b250011fd4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
- .alloc_pmd_clone = paravirt_nop,
.alloc_pud = paravirt_nop,
.release_pte = paravirt_nop,
.release_pmd = paravirt_nop,
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 0f7f130caa67..c562207b1b3d 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,7 +39,7 @@
#include
#include
#include
-#include
+#include
#include
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
@@ -560,8 +560,11 @@ static void enable_gart_translations(void)
{
int i;
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ if (!k8_northbridges.gart_supported)
+ return;
+
+ for (i = 0; i < k8_northbridges.num; i++) {
+ struct pci_dev *dev = k8_northbridges.nb_misc[i];
enable_gart_translation(dev, __pa(agp_gatt_table));
}
@@ -592,16 +595,19 @@ static void gart_fixup_northbridges(struct sys_device *dev)
if (!fix_up_north_bridges)
return;
+ if (!k8_northbridges.gart_supported)
+ return;
+
pr_info("PCI-DMA: Restoring GART aperture settings\n");
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ for (i = 0; i < k8_northbridges.num; i++) {
+ struct pci_dev *dev = k8_northbridges.nb_misc[i];
/*
* Don't enable translations just yet. That is the next
* step. Restore the pre-suspend aperture settings.
*/
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+ gart_set_size_and_enable(dev, aperture_order);
pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
}
}
@@ -649,8 +655,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
- for (i = 0; i < num_k8_northbridges; i++) {
- dev = k8_northbridges[i];
+ for (i = 0; i < k8_northbridges.num; i++) {
+ dev = k8_northbridges.nb_misc[i];
new_aper_base = read_aperture(dev, &new_aper_size);
if (!new_aper_base)
goto nommu;
@@ -718,10 +724,13 @@ static void gart_iommu_shutdown(void)
if (!no_agp)
return;
- for (i = 0; i < num_k8_northbridges; i++) {
+ if (!k8_northbridges.gart_supported)
+ return;
+
+ for (i = 0; i < k8_northbridges.num; i++) {
u32 ctl;
- dev = k8_northbridges[i];
+ dev = k8_northbridges.nb_misc[i];
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
ctl &= ~GARTEN;
@@ -739,7 +748,7 @@ int __init gart_iommu_init(void)
unsigned long scratch;
long i;
- if (num_k8_northbridges == 0)
+ if (!k8_northbridges.gart_supported)
return 0;
#ifndef CONFIG_AGP_AMD64
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f1996..000000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Ported over from i386 by AK, original copyright was:
- *
- * (C) Dominik Brodowski 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- *
- * Dropped all the hardware bug workarounds for now. Hopefully they
- * are not needed on 64bit chipsets.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-static inline u32 cyc2us(u32 cycles)
-{
- /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
- * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
- *
- * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
- * easily be multiplied with 286 (=0x11E) without having to fear
- * u32 overflows.
- */
- cycles *= 286;
- return (cycles >> 10);
-}
-
-static unsigned pmtimer_wait_tick(void)
-{
- u32 a, b;
- for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
- a == b;
- b = inl(pmtmr_ioport) & ACPI_PM_MASK)
- cpu_relax();
- return b;
-}
-
-/* note: wait time is rounded up to one tick */
-void pmtimer_wait(unsigned us)
-{
- u32 a, b;
- a = pmtimer_wait_tick();
- do {
- b = inl(pmtmr_ioport);
- cpu_relax();
- } while (cyc2us(b - a) < us);
-}
-
-static int __init nopmtimer_setup(char *s)
-{
- pmtmr_ioport = 0;
- return 1;
-}
-
-__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d9ea531ddd1..b3d7a3a04f38 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
load_TLS(next, cpu);
/* Must be after DS reload */
- unlazy_fpu(prev_p);
+ __unlazy_fpu(prev_p);
/* Make sure cpu is ready for new context */
if (preload_fpu)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e3af342fe83a..7a4cf14223ba 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -84,7 +84,7 @@ static int __init reboot_setup(char *str)
}
/* we will leave sorting out the final value
when we are ready to reboot, since we might not
- have set up boot_cpu_id or smp_num_cpu */
+ have detected BSP APIC ID or smp_num_cpu */
break;
#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 00e167870f71..a59f6a6df5e2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -83,7 +83,6 @@
#include
#include
#include
-#include
#include
#include
#include
@@ -107,7 +106,7 @@
#include
#include
#include
-#include
+#include
#ifdef CONFIG_X86_64
#include
#endif
@@ -126,7 +125,6 @@ unsigned long max_pfn_mapped;
RESERVE_BRK(dmi_alloc, 65536);
#endif
-unsigned int boot_cpu_id __read_mostly;
static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
unsigned long _brk_end = (unsigned long)__brk_base;
@@ -619,79 +617,7 @@ static __init void reserve_ibft_region(void)
reserve_early_overlap_ok(addr, addr + size, "ibft");
}
-#ifdef CONFIG_X86_RESERVE_LOW_64K
-static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE
- "%s detected: BIOS may corrupt low RAM, working around it.\n",
- d->ident);
-
- e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-
- return 0;
-}
-#endif
-
-/* List of systems that have known low memory corruption BIOS problems */
-static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
-#ifdef CONFIG_X86_RESERVE_LOW_64K
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix/MSC BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
- },
- },
- /*
- * AMI BIOS with low memory corruption was found on Intel DG45ID and
- * DG45FC boards.
- * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
- * match only DMI_BOARD_NAME and see if there is more bad products
- * with this vendor.
- */
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
- },
- },
- /*
- * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
- * match on the product name.
- */
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix BIOS",
- .matches = {
- DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
- },
- },
-#endif
- {}
-};
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
static void __init trim_bios_range(void)
{
@@ -699,8 +625,14 @@ static void __init trim_bios_range(void)
* A special case is the first 4Kb of memory;
* This is a BIOS owned area, not kernel ram, but generally
* not listed as such in the E820 table.
+ *
+ * This typically reserves additional memory (64KiB by default)
+ * since some BIOSes are known to corrupt low memory. See the
+ * Kconfig help text for X86_RESERVE_LOW.
*/
- e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
+ e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
+ E820_RAM, E820_RESERVED);
+
/*
* special case: Some BIOSen report the PC BIOS
* area (640->1Mb) as ram even though it is not.
@@ -710,6 +642,28 @@ static void __init trim_bios_range(void)
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
+static int __init parse_reservelow(char *p)
+{
+ unsigned long long size;
+
+ if (!p)
+ return -EINVAL;
+
+ size = memparse(p, &p);
+
+ if (size < 4096)
+ size = 4096;
+
+ if (size > 640*1024)
+ size = 640*1024;
+
+ reserve_low = size;
+
+ return 0;
+}
+
+early_param("reservelow", parse_reservelow);
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -736,10 +690,10 @@ void __init setup_arch(char **cmdline_p)
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
- /* VMI may relocate the fixmap; do this before touching ioremap area */
- vmi_init();
-
- /* OFW also may relocate the fixmap */
+ /*
+ * If we have OLPC OFW, we might end up relocating the fixmap due to
+ * reserve_top(), so do this before touching the ioremap area.
+ */
olpc_ofw_detect();
early_trap_init();
@@ -840,9 +794,6 @@ void __init setup_arch(char **cmdline_p)
x86_report_nx();
- /* Must be before kernel pagetables are setup */
- vmi_activate();
-
/* after early param, so could get panic from serial */
reserve_early_setup_data();
@@ -865,8 +816,6 @@ void __init setup_arch(char **cmdline_p)
dmi_scan_machine();
- dmi_check_system(bad_bios_dmi_table);
-
/*
* VMware detection requires dmi to be available, so this
* needs to be done after dmi_scan_machine, for the BP.
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a60df9ae6454..2335c15c93a4 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -253,7 +253,7 @@ void __init setup_per_cpu_areas(void)
* Up to this point, the boot CPU has been using .init.data
* area. Reload any changed state for the boot CPU.
*/
- if (cpu == boot_cpu_id)
+ if (!cpu)
switch_to_new_gdt(cpu);
}
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index cb22acf3ed09..dd4c281ffe57 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -34,7 +34,7 @@
#ifdef CONFIG_X86_LOCAL_APIC
static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
-void __init mp_sfi_register_lapic_address(unsigned long address)
+static void __init mp_sfi_register_lapic_address(unsigned long address)
{
mp_lapic_addr = address;
@@ -46,7 +46,7 @@ void __init mp_sfi_register_lapic_address(unsigned long address)
}
/* All CPUs enumerated by SFI must be present and enabled */
-void __cpuinit mp_sfi_register_lapic(u8 id)
+static void __cpuinit mp_sfi_register_lapic(u8 id)
{
if (MAX_APICS - id <= 0) {
pr_warning("Processor #%d invalid (max %d)\n",
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8b3bfc4dd708..dfb50890b5b7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,7 +62,7 @@
#include
#include
#include
-#include
+#include
#include
#include
#include
@@ -311,7 +311,6 @@ notrace static void __cpuinit start_secondary(void *unused)
__flush_tlb_all();
#endif
- vmi_bringup();
cpu_init();
preempt_disable();
smp_callin();
@@ -324,9 +323,9 @@ notrace static void __cpuinit start_secondary(void *unused)
check_tsc_sync_target();
if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
enable_NMI_through_LVT0();
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
}
/* This must be done before setting cpu_online_mask */
@@ -397,6 +396,19 @@ void __cpuinit smp_store_cpu_info(int id)
identify_secondary_cpu(c);
}
+static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+{
+ struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
+ struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
+
+ cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
+ cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
+ cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
+ cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
+ cpumask_set_cpu(cpu1, c2->llc_shared_map);
+ cpumask_set_cpu(cpu2, c1->llc_shared_map);
+}
+
void __cpuinit set_cpu_sibling_map(int cpu)
{
@@ -409,14 +421,13 @@ void __cpuinit set_cpu_sibling_map(int cpu)
for_each_cpu(i, cpu_sibling_setup_mask) {
struct cpuinfo_x86 *o = &cpu_data(i);
- if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_core_id == o->cpu_core_id) {
- cpumask_set_cpu(i, cpu_sibling_mask(cpu));
- cpumask_set_cpu(cpu, cpu_sibling_mask(i));
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
- cpumask_set_cpu(i, c->llc_shared_map);
- cpumask_set_cpu(cpu, o->llc_shared_map);
+ if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ if (c->phys_proc_id == o->phys_proc_id &&
+ c->compute_unit_id == o->compute_unit_id)
+ link_thread_siblings(cpu, i);
+ } else if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_core_id == o->cpu_core_id) {
+ link_thread_siblings(cpu, i);
}
}
} else {
@@ -1109,8 +1120,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
}
set_cpu_sibling_map(0);
- enable_IR_x2apic();
- default_setup_apic_routing();
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -1118,6 +1127,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
goto out;
}
+ default_setup_apic_routing();
+
preempt_disable();
if (read_apic_id() != boot_cpu_physical_apicid) {
panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
@@ -1383,11 +1394,88 @@ void play_dead_common(void)
local_irq_disable();
}
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+static inline void mwait_play_dead(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int highest_cstate = 0;
+ unsigned int highest_subcstate = 0;
+ int i;
+ void *mwait_ptr;
+
+ if (!cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT))
+ return;
+ if (!cpu_has(¤t_cpu_data, X86_FEATURE_CLFLSH))
+ return;
+ if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return;
+
+ eax = CPUID_MWAIT_LEAF;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ /*
+ * eax will be 0 if EDX enumeration is not valid.
+ * Initialized below to cstate, sub_cstate value when EDX is valid.
+ */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+ eax = 0;
+ } else {
+ edx >>= MWAIT_SUBSTATE_SIZE;
+ for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+ if (edx & MWAIT_SUBSTATE_MASK) {
+ highest_cstate = i;
+ highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+ }
+ }
+ eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+ (highest_subcstate - 1);
+ }
+
+ /*
+ * This should be a memory location in a cache line which is
+ * unlikely to be touched by other processors. The actual
+ * content is immaterial as it is not actually modified in any way.
+ */
+ mwait_ptr = ¤t_thread_info()->flags;
+
+ wbinvd();
+
+ while (1) {
+ /*
+ * The CLFLUSH is a workaround for erratum AAI65 for
+ * the Xeon 7400 series. It's not clear it is actually
+ * needed, but it should be harmless in either case.
+ * The WBINVD is insufficient due to the spurious-wakeup
+ * case where we return around the loop.
+ */
+ clflush(mwait_ptr);
+ __monitor(mwait_ptr, 0, 0);
+ mb();
+ __mwait(eax, 0);
+ }
+}
+
+static inline void hlt_play_dead(void)
+{
+ if (current_cpu_data.x86 >= 4)
+ wbinvd();
+
+ while (1) {
+ native_halt();
+ }
+}
+
void native_play_dead(void)
{
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
- wbinvd_halt();
+
+ mwait_play_dead(); /* Only returns on failure */
+ hlt_play_dead();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d5e06624e34a..0b0cb5fede19 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -33,8 +33,8 @@ int kernel_execve(const char *filename,
const char *const envp[])
{
long __res;
- asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
+ asm volatile ("int $0x80"
: "=a" (__res)
- : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
+ : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
return __res;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 60788dee0f8a..d43968503dd2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -776,21 +776,10 @@ asmlinkage void math_state_restore(void)
}
EXPORT_SYMBOL_GPL(math_state_restore);
-#ifndef CONFIG_MATH_EMULATION
-void math_emulate(struct math_emu_info *info)
-{
- printk(KERN_EMERG
- "math-emulation not enabled and no coprocessor found.\n");
- printk(KERN_EMERG "killing %s.\n", current->comm);
- force_sig(SIGFPE, current);
- schedule();
-}
-#endif /* CONFIG_MATH_EMULATION */
-
dotraplinkage void __kprobes
do_device_not_available(struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_MATH_EMULATION
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };
@@ -798,12 +787,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)
info.regs = regs;
math_emulate(&info);
- } else {
- math_state_restore(); /* interrupts still off */
- conditional_sti(regs);
+ return;
}
-#else
- math_state_restore();
+#endif
+ math_state_restore(); /* interrupts still off */
+#ifdef CONFIG_X86_32
+ conditional_sti(regs);
#endif
}
@@ -881,18 +870,6 @@ void __init trap_init(void)
#endif
#ifdef CONFIG_X86_32
- if (cpu_has_fxsr) {
- printk(KERN_INFO "Enabling fast FPU save and restore... ");
- set_in_cr4(X86_CR4_OSFXSR);
- printk("done.\n");
- }
- if (cpu_has_xmm) {
- printk(KERN_INFO
- "Enabling unmasked SIMD FPU exception support... ");
- set_in_cr4(X86_CR4_OSXMMEXCPT);
- printk("done.\n");
- }
-
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 26a863a9c2a8..0c40d8b72416 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
__setup("notsc", notsc_setup);
+static int no_sched_irq_time;
+
static int __init tsc_setup(char *str)
{
if (!strcmp(str, "reliable"))
tsc_clocksource_reliable = 1;
+ if (!strncmp(str, "noirqtime", 9))
+ no_sched_irq_time = 1;
return 1;
}
@@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason)
if (!tsc_unstable) {
tsc_unstable = 1;
sched_clock_stable = 0;
+ disable_sched_clock_irqtime();
printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
@@ -892,60 +897,6 @@ static void __init init_tsc_clocksource(void)
clocksource_register_khz(&clocksource_tsc, tsc_khz);
}
-#ifdef CONFIG_X86_64
-/*
- * calibrate_cpu is used on systems with fixed rate TSCs to determine
- * processor frequency
- */
-#define TICK_COUNT 100000000
-static unsigned long __init calibrate_cpu(void)
-{
- int tsc_start, tsc_now;
- int i, no_ctr_free;
- unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
- unsigned long flags;
-
- for (i = 0; i < 4; i++)
- if (avail_to_resrv_perfctr_nmi_bit(i))
- break;
- no_ctr_free = (i == 4);
- if (no_ctr_free) {
- WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
- "cpu_khz value may be incorrect.\n");
- i = 3;
- rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- rdmsrl(MSR_K7_PERFCTR3, pmc3);
- } else {
- reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
- local_irq_save(flags);
- /* start measuring cycles, incrementing from 0 */
- wrmsrl(MSR_K7_PERFCTR0 + i, 0);
- wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
- rdtscl(tsc_start);
- do {
- rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
- tsc_now = get_cycles();
- } while ((tsc_now - tsc_start) < TICK_COUNT);
-
- local_irq_restore(flags);
- if (no_ctr_free) {
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- wrmsrl(MSR_K7_PERFCTR3, pmc3);
- wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
- } else {
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-
- return pmc_now * tsc_khz / (tsc_now - tsc_start);
-}
-#else
-static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
-#endif
-
void __init tsc_init(void)
{
u64 lpj;
@@ -964,10 +915,6 @@ void __init tsc_init(void)
return;
}
- if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
- cpu_khz = calibrate_cpu();
-
printk("Detected %lu.%03lu MHz processor.\n",
(unsigned long)cpu_khz / 1000,
(unsigned long)cpu_khz % 1000);
@@ -987,6 +934,9 @@ void __init tsc_init(void)
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;
+ if (!no_sched_irq_time)
+ enable_sched_clock_irqtime();
+
lpj = ((u64)tsc_khz * 1000);
do_div(lpj, HZ);
lpj_fine = lpj;
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index 1132129db792..7b24460917d5 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -28,34 +28,21 @@ struct uv_irq_2_mmr_pnode{
static spinlock_t uv_irq_lock;
static struct rb_root uv_irq_root;
-static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
+static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool);
-static void uv_noop(unsigned int irq)
-{
-}
+static void uv_noop(struct irq_data *data) { }
-static unsigned int uv_noop_ret(unsigned int irq)
-{
- return 0;
-}
-
-static void uv_ack_apic(unsigned int irq)
+static void uv_ack_apic(struct irq_data *data)
{
ack_APIC_irq();
}
static struct irq_chip uv_irq_chip = {
- .name = "UV-CORE",
- .startup = uv_noop_ret,
- .shutdown = uv_noop,
- .enable = uv_noop,
- .disable = uv_noop,
- .ack = uv_noop,
- .mask = uv_noop,
- .unmask = uv_noop,
- .eoi = uv_ack_apic,
- .end = uv_noop,
- .set_affinity = uv_set_irq_affinity,
+ .name = "UV-CORE",
+ .irq_mask = uv_noop,
+ .irq_unmask = uv_noop,
+ .irq_eoi = uv_ack_apic,
+ .irq_set_affinity = uv_set_irq_affinity,
};
/*
@@ -144,26 +131,22 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long mmr_offset, int limit)
{
const struct cpumask *eligible_cpu = cpumask_of(cpu);
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- int mmr_pnode;
+ struct irq_cfg *cfg = get_irq_chip_data(irq);
unsigned long mmr_value;
struct uv_IO_APIC_route_entry *entry;
- int err;
+ int mmr_pnode, err;
BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
sizeof(unsigned long));
- cfg = irq_cfg(irq);
-
err = assign_irq_vector(irq, cfg, eligible_cpu);
if (err != 0)
return err;
if (limit == UV_AFFINITY_CPU)
- desc->status |= IRQ_NO_BALANCING;
+ irq_set_status_flags(irq, IRQ_NO_BALANCING);
else
- desc->status |= IRQ_MOVE_PCNTXT;
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
irq_name);
@@ -206,17 +189,17 @@ static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
}
-static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
+ struct irq_cfg *cfg = data->chip_data;
unsigned int dest;
- unsigned long mmr_value;
+ unsigned long mmr_value, mmr_offset;
struct uv_IO_APIC_route_entry *entry;
- unsigned long mmr_offset;
int mmr_pnode;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
mmr_value = 0;
@@ -231,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
entry->dest = dest;
/* Get previously stored MMR and pnode of hub sourcing interrupts */
- if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
+ if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode))
return -1;
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e680ea52db9b..3371bd053b89 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -66,10 +66,7 @@ static void __init visws_time_init(void)
}
/* Replaces the default init_ISA_irqs in the generic setup */
-static void __init visws_pre_intr_init(void)
-{
- init_VISWS_APIC_irqs();
-}
+static void __init visws_pre_intr_init(void);
/* Quirk for machine specific memory setup. */
@@ -429,67 +426,34 @@ static int is_co_apic(unsigned int irq)
/*
* This is the SGI Cobalt (IO-)APIC:
*/
-
-static void enable_cobalt_irq(unsigned int irq)
+static void enable_cobalt_irq(struct irq_data *data)
{
- co_apic_set(is_co_apic(irq), irq);
+ co_apic_set(is_co_apic(data->irq), data->irq);
}
-static void disable_cobalt_irq(unsigned int irq)
+static void disable_cobalt_irq(struct irq_data *data)
{
- int entry = is_co_apic(irq);
+ int entry = is_co_apic(data->irq);
co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
co_apic_read(CO_APIC_LO(entry));
}
-/*
- * "irq" really just serves to identify the device. Here is where we
- * map this to the Cobalt APIC entry where it's physically wired.
- * This is called via request_irq -> setup_irq -> irq_desc->startup()
- */
-static unsigned int startup_cobalt_irq(unsigned int irq)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_to_desc(irq);
-
- spin_lock_irqsave(&cobalt_lock, flags);
- if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
- desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
- enable_cobalt_irq(irq);
- spin_unlock_irqrestore(&cobalt_lock, flags);
- return 0;
-}
-
-static void ack_cobalt_irq(unsigned int irq)
+static void ack_cobalt_irq(struct irq_data *data)
{
unsigned long flags;
spin_lock_irqsave(&cobalt_lock, flags);
- disable_cobalt_irq(irq);
+ disable_cobalt_irq(data);
apic_write(APIC_EOI, APIC_EIO_ACK);
spin_unlock_irqrestore(&cobalt_lock, flags);
}
-static void end_cobalt_irq(unsigned int irq)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_to_desc(irq);
-
- spin_lock_irqsave(&cobalt_lock, flags);
- if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
- enable_cobalt_irq(irq);
- spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
static struct irq_chip cobalt_irq_type = {
- .name = "Cobalt-APIC",
- .startup = startup_cobalt_irq,
- .shutdown = disable_cobalt_irq,
- .enable = enable_cobalt_irq,
- .disable = disable_cobalt_irq,
- .ack = ack_cobalt_irq,
- .end = end_cobalt_irq,
+ .name = "Cobalt-APIC",
+ .irq_enable = enable_cobalt_irq,
+ .irq_disable = disable_cobalt_irq,
+ .irq_ack = ack_cobalt_irq,
};
@@ -503,35 +467,34 @@ static struct irq_chip cobalt_irq_type = {
* interrupt controller type, and through a special virtual interrupt-
* controller. Device drivers only see the virtual interrupt sources.
*/
-static unsigned int startup_piix4_master_irq(unsigned int irq)
+static unsigned int startup_piix4_master_irq(struct irq_data *data)
{
legacy_pic->init(0);
-
- return startup_cobalt_irq(irq);
+ enable_cobalt_irq(data);
}
-static void end_piix4_master_irq(unsigned int irq)
+static void end_piix4_master_irq(struct irq_data *data)
{
unsigned long flags;
spin_lock_irqsave(&cobalt_lock, flags);
- enable_cobalt_irq(irq);
+ enable_cobalt_irq(data);
spin_unlock_irqrestore(&cobalt_lock, flags);
}
static struct irq_chip piix4_master_irq_type = {
- .name = "PIIX4-master",
- .startup = startup_piix4_master_irq,
- .ack = ack_cobalt_irq,
- .end = end_piix4_master_irq,
+ .name = "PIIX4-master",
+ .irq_startup = startup_piix4_master_irq,
+ .irq_ack = ack_cobalt_irq,
};
+static void pii4_mask(struct irq_data *data) { }
static struct irq_chip piix4_virtual_irq_type = {
- .name = "PIIX4-virtual",
+ .name = "PIIX4-virtual",
+ .mask = pii4_mask,
};
-
/*
* PIIX4-8259 master/virtual functions to handle interrupt requests
* from legacy devices: floppy, parallel, serial, rtc.
@@ -549,9 +512,8 @@ static struct irq_chip piix4_virtual_irq_type = {
*/
static irqreturn_t piix4_master_intr(int irq, void *dev_id)
{
- int realirq;
- struct irq_desc *desc;
unsigned long flags;
+ int realirq;
raw_spin_lock_irqsave(&i8259A_lock, flags);
@@ -592,18 +554,10 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
- desc = irq_to_desc(realirq);
-
/*
* handle this 'virtual interrupt' as a Cobalt one now.
*/
- kstat_incr_irqs_this_cpu(realirq, desc);
-
- if (likely(desc->action != NULL))
- handle_IRQ_event(realirq, desc->action);
-
- if (!(desc->status & IRQ_DISABLED))
- legacy_pic->chip->unmask(realirq);
+ generic_handle_irq(realirq);
return IRQ_HANDLED;
@@ -624,41 +578,35 @@ static struct irqaction cascade_action = {
static inline void set_piix4_virtual_irq_type(void)
{
- piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
piix4_virtual_irq_type.enable = i8259A_chip.unmask;
piix4_virtual_irq_type.disable = i8259A_chip.mask;
+ piix4_virtual_irq_type.unmask = i8259A_chip.unmask;
}
-void init_VISWS_APIC_irqs(void)
+static void __init visws_pre_intr_init(void)
{
int i;
+ set_piix4_virtual_irq_type();
+
for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
- struct irq_desc *desc = irq_to_desc(i);
+ struct irq_chip *chip = NULL;
- desc->status = IRQ_DISABLED;
- desc->action = 0;
- desc->depth = 1;
+ if (i == 0)
+ chip = &cobalt_irq_type;
+ else if (i == CO_IRQ_IDE0)
+ chip = &cobalt_irq_type;
+ else if (i == CO_IRQ_IDE1)
+ >chip = &cobalt_irq_type;
+ else if (i == CO_IRQ_8259)
+ chip = &piix4_master_irq_type;
+ else if (i < CO_IRQ_APIC0)
+ chip = &piix4_virtual_irq_type;
+ else if (IS_CO_APIC(i))
+ chip = &cobalt_irq_type;
- if (i == 0) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_IDE0) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_IDE1) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_8259) {
- desc->chip = &piix4_master_irq_type;
- }
- else if (i < CO_IRQ_APIC0) {
- set_piix4_virtual_irq_type();
- desc->chip = &piix4_virtual_irq_type;
- }
- else if (IS_CO_APIC(i)) {
- desc->chip = &cobalt_irq_type;
- }
+ if (chip)
+ set_irq_chip(i, chip);
}
setup_irq(CO_IRQ_8259, &master_action);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb7526..000000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
-/*
- * VMI specific paravirt-ops implementation
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to zach@vmware.com
- *
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-/* Convenient for calling VMI functions indirectly in the ROM */
-typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
-typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
-
-#define call_vrom_func(rom,func) \
- (((VROMFUNC *)(rom->func))())
-
-#define call_vrom_long_func(rom,func,arg) \
- (((VROMLONGFUNC *)(rom->func)) (arg))
-
-static struct vrom_header *vmi_rom;
-static int disable_pge;
-static int disable_pse;
-static int disable_sep;
-static int disable_tsc;
-static int disable_mtrr;
-static int disable_noidle;
-static int disable_vmi_timer;
-
-/* Cached VMI operations */
-static struct {
- void (*cpuid)(void /* non-c */);
- void (*_set_ldt)(u32 selector);
- void (*set_tr)(u32 selector);
- void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
- void (*set_kernel_stack)(u32 selector, u32 sp0);
- void (*allocate_page)(u32, u32, u32, u32, u32);
- void (*release_page)(u32, u32);
- void (*set_pte)(pte_t, pte_t *, unsigned);
- void (*update_pte)(pte_t *, unsigned);
- void (*set_linear_mapping)(int, void *, u32, u32);
- void (*_flush_tlb)(int);
- void (*set_initial_ap_state)(int, int);
- void (*halt)(void);
- void (*set_lazy_mode)(int mode);
-} vmi_ops;
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-/*
- * VMI patching routines.
- */
-#define MNEM_CALL 0xe8
-#define MNEM_JMP 0xe9
-#define MNEM_RET 0xc3
-
-#define IRQ_PATCH_INT_MASK 0
-#define IRQ_PATCH_DISABLE 5
-
-static inline void patch_offset(void *insnbuf,
- unsigned long ip, unsigned long dest)
-{
- *(unsigned long *)(insnbuf+1) = dest-ip-5;
-}
-
-static unsigned patch_internal(int call, unsigned len, void *insnbuf,
- unsigned long ip)
-{
- u64 reloc;
- struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
- switch(rel->type) {
- case VMI_RELOCATION_CALL_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_CALL;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_JUMP_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_JMP;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_NOP:
- /* obliterate the whole thing */
- return 0;
-
- case VMI_RELOCATION_NONE:
- /* leave native code in place */
- break;
-
- default:
- BUG();
- }
- return len;
-}
-
-/*
- * Apply patch if appropriate, return length of new instruction
- * sequence. The callee does nop padding for us.
- */
-static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
- unsigned long ip, unsigned len)
-{
- switch (type) {
- case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
- return patch_internal(VMI_CALL_DisableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
- return patch_internal(VMI_CALL_EnableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
- return patch_internal(VMI_CALL_SetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.save_fl):
- return patch_internal(VMI_CALL_GetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.iret):
- return patch_internal(VMI_CALL_IRET, len, insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
- return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
- default:
- break;
- }
- return len;
-}
-
-/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
-static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
- unsigned int *cx, unsigned int *dx)
-{
- int override = 0;
- if (*ax == 1)
- override = 1;
- asm volatile ("call *%6"
- : "=a" (*ax),
- "=b" (*bx),
- "=c" (*cx),
- "=d" (*dx)
- : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
- if (override) {
- if (disable_pse)
- *dx &= ~X86_FEATURE_PSE;
- if (disable_pge)
- *dx &= ~X86_FEATURE_PGE;
- if (disable_sep)
- *dx &= ~X86_FEATURE_SEP;
- if (disable_tsc)
- *dx &= ~X86_FEATURE_TSC;
- if (disable_mtrr)
- *dx &= ~X86_FEATURE_MTRR;
- }
-}
-
-static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
-{
- if (gdt[nr].a != new->a || gdt[nr].b != new->b)
- write_gdt_entry(gdt, nr, new, 0);
-}
-
-static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
-}
-
-static void vmi_set_ldt(const void *addr, unsigned entries)
-{
- unsigned cpu = smp_processor_id();
- struct desc_struct desc;
-
- pack_descriptor(&desc, (unsigned long)addr,
- entries * sizeof(struct desc_struct) - 1,
- DESC_LDT, 0);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
- vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
-}
-
-static void vmi_set_tr(void)
-{
- vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
-}
-
-static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
-{
- u32 *idt_entry = (u32 *)g;
- vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
-}
-
-static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
- const void *desc, int type)
-{
- u32 *gdt_entry = (u32 *)desc;
- vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
-}
-
-static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
- const void *desc)
-{
- u32 *ldt_entry = (u32 *)desc;
- vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
-}
-
-static void vmi_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- tss->x86_tss.sp0 = thread->sp0;
-
- /* This can only happen when SEP is enabled, no need to test "SEP"arately */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
- vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
-}
-
-static void vmi_flush_tlb_user(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB);
-}
-
-static void vmi_flush_tlb_kernel(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
-}
-
-/* Stub to do nothing at all; used for delays and unimplemented calls */
-static void vmi_nop(void)
-{
-}
-
-static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
-{
- /*
- * This call comes in very early, before mem_map is setup.
- * It is called only for swapper_pg_dir, which already has
- * data on it.
- */
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
-}
-
-static void vmi_release_pte(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L1);
-}
-
-static void vmi_release_pmd(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * We use the pgd_free hook for releasing the pgd page:
- */
-static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
-
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * Helper macros for MMU update flags. We can defer updates until a flush
- * or page invalidation only if the update is to the current address space
- * (otherwise, there is no flush). We must check against init_mm, since
- * this could be a kernel update, which usually passes init_mm, although
- * sometimes this check can be skipped if we know the particular function
- * is only called on user mode PTEs. We could change the kernel to pass
- * current->active_mm here, but in particular, I was unsure if changing
- * mm/highmem.c to do this would still be correct on other architectures.
- */
-#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
- (!mustbeuser && (mm) == &init_mm))
-#define vmi_flags_addr(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-#define vmi_flags_addr_defer(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-
-static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pte(pte_t *ptep, pte_t pte)
-{
- /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
- vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-#ifdef CONFIG_X86_PAE
- const pte_t pte = { .pte = pmdval.pmd };
-#else
- const pte_t pte = { pmdval.pud.pgd.pgd };
-#endif
- vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
-}
-
-#ifdef CONFIG_X86_PAE
-
-static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
- /*
- * XXX This is called from set_pmd_pte, but at both PT
- * and PD layers so the VMI_PAGE_PT flag is wrong. But
- * it is only called for large page mapping changes,
- * the Xen backend, doesn't support large pages, and the
- * ESX backend doesn't depend on the flag.
- */
- set_64bit((unsigned long long *)ptep,pte_val(pteval));
- vmi_ops.update_pte(ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pud(pud_t *pudp, pud_t pudval)
-{
- /* Um, eww */
- const pte_t pte = { .pte = pudval.pgd.pgd };
- vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
-}
-
-static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_pmd_clear(pmd_t *pmd)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
-}
-#endif
-
-#ifdef CONFIG_SMP
-static void __devinit
-vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
- unsigned long start_esp)
-{
- struct vmi_ap_state ap;
-
- /* Default everything to zero. This is fine for most GPRs. */
- memset(&ap, 0, sizeof(struct vmi_ap_state));
-
- ap.gdtr_limit = GDT_SIZE - 1;
- ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
-
- ap.idtr_limit = IDT_ENTRIES * 8 - 1;
- ap.idtr_base = (unsigned long) idt_table;
-
- ap.ldtr = 0;
-
- ap.cs = __KERNEL_CS;
- ap.eip = (unsigned long) start_eip;
- ap.ss = __KERNEL_DS;
- ap.esp = (unsigned long) start_esp;
-
- ap.ds = __USER_DS;
- ap.es = __USER_DS;
- ap.fs = __KERNEL_PERCPU;
- ap.gs = __KERNEL_STACK_CANARY;
-
- ap.eflags = 0;
-
-#ifdef CONFIG_X86_PAE
- /* efer should match BSP efer. */
- if (cpu_has_nx) {
- unsigned l, h;
- rdmsr(MSR_EFER, l, h);
- ap.efer = (unsigned long long) h << 32 | l;
- }
-#endif
-
- ap.cr3 = __pa(swapper_pg_dir);
- /* Protected mode, paging, AM, WP, NE, MP. */
- ap.cr0 = 0x80050023;
- ap.cr4 = mmu_cr4_features;
- vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
-}
-#endif
-
-static void vmi_start_context_switch(struct task_struct *prev)
-{
- paravirt_start_context_switch(prev);
- vmi_ops.set_lazy_mode(2);
-}
-
-static void vmi_end_context_switch(struct task_struct *next)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_end_context_switch(next);
-}
-
-static void vmi_enter_lazy_mmu(void)
-{
- paravirt_enter_lazy_mmu();
- vmi_ops.set_lazy_mode(1);
-}
-
-static void vmi_leave_lazy_mmu(void)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_leave_lazy_mmu();
-}
-
-static inline int __init check_vmi_rom(struct vrom_header *rom)
-{
- struct pci_header *pci;
- struct pnp_header *pnp;
- const char *manufacturer = "UNKNOWN";
- const char *product = "UNKNOWN";
- const char *license = "unspecified";
-
- if (rom->rom_signature != 0xaa55)
- return 0;
- if (rom->vrom_signature != VMI_SIGNATURE)
- return 0;
- if (rom->api_version_maj != VMI_API_REV_MAJOR ||
- rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
- printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
- rom->api_version_maj,
- rom->api_version_min);
- return 0;
- }
-
- /*
- * Relying on the VMI_SIGNATURE field is not 100% safe, so check
- * the PCI header and device type to make sure this is really a
- * VMI device.
- */
- if (!rom->pci_header_offs) {
- printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
- return 0;
- }
-
- pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
- if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
- pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
- /* Allow it to run... anyways, but warn */
- printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
- }
-
- if (rom->pnp_header_offs) {
- pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
- if (pnp->manufacturer_offset)
- manufacturer = (const char *)rom+pnp->manufacturer_offset;
- if (pnp->product_offset)
- product = (const char *)rom+pnp->product_offset;
- }
-
- if (rom->license_offs)
- license = (char *)rom+rom->license_offs;
-
- printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
- manufacturer, product,
- rom->api_version_maj, rom->api_version_min,
- pci->rom_version_maj, pci->rom_version_min);
-
- /* Don't allow BSD/MIT here for now because we don't want to end up
- with any binary only shim layers */
- if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
- printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
- license);
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Probe for the VMI option ROM
- */
-static inline int __init probe_vmi_rom(void)
-{
- unsigned long base;
-
- /* VMI ROM is in option ROM area, check signature */
- for (base = 0xC0000; base < 0xE0000; base += 2048) {
- struct vrom_header *romstart;
- romstart = (struct vrom_header *)isa_bus_to_virt(base);
- if (check_vmi_rom(romstart)) {
- vmi_rom = romstart;
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * VMI setup common to all processors
- */
-void vmi_bringup(void)
-{
- /* We must establish the lowmem mapping for MMU ops to work */
- if (vmi_ops.set_linear_mapping)
- vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
-}
-
-/*
- * Return a pointer to a VMI function or NULL if unimplemented
- */
-static void *vmi_get_function(int vmicall)
-{
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
- if (rel->type == VMI_RELOCATION_CALL_REL)
- return (void *)rel->eip;
- else
- return NULL;
-}
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For unimplemented operations, fall back to default, unless nop
- * is returned by the ROM.
- */
-#define para_fill(opname, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- if (rel->type == VMI_RELOCATION_CALL_REL) \
- opname = (void *)rel->eip; \
- else if (rel->type == VMI_RELOCATION_NOP) \
- opname = (void *)vmi_nop; \
- else if (rel->type != VMI_RELOCATION_NONE) \
- printk(KERN_WARNING "VMI: Unknown relocation " \
- "type %d for " #vmicall"\n",\
- rel->type); \
-} while (0)
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For cached operations which do not match the VMI ROM ABI and must
- * go through a tranlation stub. Ignore NOPs, since it is not clear
- * a NOP * VMI function corresponds to a NOP paravirt-op when the
- * functions are not in 1-1 correspondence.
- */
-#define para_wrap(opname, wrapper, cache, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
- if (rel->type == VMI_RELOCATION_CALL_REL) { \
- opname = wrapper; \
- vmi_ops.cache = (void *)rel->eip; \
- } \
-} while (0)
-
-/*
- * Activate the VMI interface and switch into paravirtualized mode
- */
-static inline int __init activate_vmi(void)
-{
- short kernel_cs;
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-
- /*
- * Prevent page tables from being allocated in highmem, even if
- * CONFIG_HIGHPTE is enabled.
- */
- __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
-
- if (call_vrom_func(vmi_rom, vmi_init) != 0) {
- printk(KERN_ERR "VMI ROM failed to initialize!");
- return 0;
- }
- savesegment(cs, kernel_cs);
-
- pv_info.paravirt_enabled = 1;
- pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
- pv_info.name = "vmi [deprecated]";
-
- pv_init_ops.patch = vmi_patch;
-
- /*
- * Many of these operations are ABI compatible with VMI.
- * This means we can fill in the paravirt-ops with direct
- * pointers into the VMI ROM. If the calling convention for
- * these operations changes, this code needs to be updated.
- *
- * Exceptions
- * CPUID paravirt-op uses pointers, not the native ISA
- * halt has no VMI equivalent; all VMI halts are "safe"
- * no MSR support yet - just trap and emulate. VMI uses the
- * same ABI as the native ISA, but Linux wants exceptions
- * from bogus MSR read / write handled
- * rdpmc is not yet used in Linux
- */
-
- /* CPUID is special, so very special it gets wrapped like a present */
- para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
-
- para_fill(pv_cpu_ops.clts, CLTS);
- para_fill(pv_cpu_ops.get_debugreg, GetDR);
- para_fill(pv_cpu_ops.set_debugreg, SetDR);
- para_fill(pv_cpu_ops.read_cr0, GetCR0);
- para_fill(pv_mmu_ops.read_cr2, GetCR2);
- para_fill(pv_mmu_ops.read_cr3, GetCR3);
- para_fill(pv_cpu_ops.read_cr4, GetCR4);
- para_fill(pv_cpu_ops.write_cr0, SetCR0);
- para_fill(pv_mmu_ops.write_cr2, SetCR2);
- para_fill(pv_mmu_ops.write_cr3, SetCR3);
- para_fill(pv_cpu_ops.write_cr4, SetCR4);
-
- para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
- para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
- para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
- para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
-
- para_fill(pv_cpu_ops.wbinvd, WBINVD);
- para_fill(pv_cpu_ops.read_tsc, RDTSC);
-
- /* The following we emulate with trap and emulate for now */
- /* paravirt_ops.read_msr = vmi_rdmsr */
- /* paravirt_ops.write_msr = vmi_wrmsr */
- /* paravirt_ops.rdpmc = vmi_rdpmc */
-
- /* TR interface doesn't pass TR value, wrap */
- para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
-
- /* LDT is special, too */
- para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
- para_fill(pv_cpu_ops.load_gdt, SetGDT);
- para_fill(pv_cpu_ops.load_idt, SetIDT);
- para_fill(pv_cpu_ops.store_gdt, GetGDT);
- para_fill(pv_cpu_ops.store_idt, GetIDT);
- para_fill(pv_cpu_ops.store_tr, GetTR);
- pv_cpu_ops.load_tls = vmi_load_tls;
- para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
- write_ldt_entry, WriteLDTEntry);
- para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
- write_gdt_entry, WriteGDTEntry);
- para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
- write_idt_entry, WriteIDTEntry);
- para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
- para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
- para_fill(pv_cpu_ops.io_delay, IODelay);
-
- para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
- set_lazy_mode, SetLazyMode);
-
- para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
- set_lazy_mode, SetLazyMode);
-
- /* user and kernel flush are just handled with different flags to FlushTLB */
- para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
- para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
- para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
-
- /*
- * Until a standard flag format can be agreed on, we need to
- * implement these as wrappers in Linux. Get the VMI ROM
- * function pointers for the two backend calls.
- */
-#ifdef CONFIG_X86_PAE
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
-#else
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
-#endif
-
- if (vmi_ops.set_pte) {
- pv_mmu_ops.set_pte = vmi_set_pte;
- pv_mmu_ops.set_pte_at = vmi_set_pte_at;
- pv_mmu_ops.set_pmd = vmi_set_pmd;
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
- pv_mmu_ops.set_pud = vmi_set_pud;
- pv_mmu_ops.pte_clear = vmi_pte_clear;
- pv_mmu_ops.pmd_clear = vmi_pmd_clear;
-#endif
- }
-
- if (vmi_ops.update_pte) {
- pv_mmu_ops.pte_update = vmi_update_pte;
- pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
- }
-
- vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
- if (vmi_ops.allocate_page) {
- pv_mmu_ops.alloc_pte = vmi_allocate_pte;
- pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
- pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
- }
-
- vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
- if (vmi_ops.release_page) {
- pv_mmu_ops.release_pte = vmi_release_pte;
- pv_mmu_ops.release_pmd = vmi_release_pmd;
- pv_mmu_ops.pgd_free = vmi_pgd_free;
- }
-
- /* Set linear is needed in all cases */
- vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-
- /*
- * These MUST always be patched. Don't support indirect jumps
- * through these operations, as the VMI interface may use either
- * a jump or a call to get to these operations, depending on
- * the backend. They are performance critical anyway, so requiring
- * a patch is not a big problem.
- */
- pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
- pv_cpu_ops.iret = (void *)0xbadbab0;
-
-#ifdef CONFIG_SMP
- para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- para_fill(apic->read, APICRead);
- para_fill(apic->write, APICWrite);
-#endif
-
- /*
- * Check for VMI timer functionality by probing for a cycle frequency method
- */
- reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
- if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
- vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
- vmi_timer_ops.get_cycle_counter =
- vmi_get_function(VMI_CALL_GetCycleCounter);
- vmi_timer_ops.get_wallclock =
- vmi_get_function(VMI_CALL_GetWallclockTime);
- vmi_timer_ops.wallclock_updated =
- vmi_get_function(VMI_CALL_WallclockUpdated);
- vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
- vmi_timer_ops.cancel_alarm =
- vmi_get_function(VMI_CALL_CancelAlarm);
- x86_init.timers.timer_init = vmi_time_init;
-#ifdef CONFIG_X86_LOCAL_APIC
- x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
- x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
-#endif
- pv_time_ops.sched_clock = vmi_sched_clock;
- x86_platform.calibrate_tsc = vmi_tsc_khz;
- x86_platform.get_wallclock = vmi_get_wallclock;
- x86_platform.set_wallclock = vmi_set_wallclock;
-
- /* We have true wallclock functions; disable CMOS clock sync */
- no_sync_cmos_clock = 1;
- } else {
- disable_noidle = 1;
- disable_vmi_timer = 1;
- }
-
- para_fill(pv_irq_ops.safe_halt, Halt);
-
- /*
- * Alternative instruction rewriting doesn't happen soon enough
- * to convert VMI_IRET to a call instead of a jump; so we have
- * to do this before IRQs get reenabled. Fortunately, it is
- * idempotent.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
-
- vmi_bringup();
-
- return 1;
-}
-
-#undef para_fill
-
-void __init vmi_init(void)
-{
- if (!vmi_rom)
- probe_vmi_rom();
- else
- check_vmi_rom(vmi_rom);
-
- /* In case probing for or validating the ROM failed, basil */
- if (!vmi_rom)
- return;
-
- reserve_top_address(-vmi_rom->virtual_top);
-
-#ifdef CONFIG_X86_IO_APIC
- /* This is virtual hardware; timer routing is wired correctly */
- no_timer_check = 1;
-#endif
-}
-
-void __init vmi_activate(void)
-{
- unsigned long flags;
-
- if (!vmi_rom)
- return;
-
- local_irq_save(flags);
- activate_vmi();
- local_irq_restore(flags & X86_EFLAGS_IF);
-}
-
-static int __init parse_vmi(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (!strcmp(arg, "disable_pge")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
- disable_pge = 1;
- } else if (!strcmp(arg, "disable_pse")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
- disable_pse = 1;
- } else if (!strcmp(arg, "disable_sep")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
- disable_sep = 1;
- } else if (!strcmp(arg, "disable_tsc")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
- disable_tsc = 1;
- } else if (!strcmp(arg, "disable_mtrr")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
- disable_mtrr = 1;
- } else if (!strcmp(arg, "disable_timer")) {
- disable_vmi_timer = 1;
- disable_noidle = 1;
- } else if (!strcmp(arg, "disable_noidle"))
- disable_noidle = 1;
- return 0;
-}
-
-early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd73..000000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2007, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-
-static DEFINE_PER_CPU(struct clock_event_device, local_events);
-
-static inline u32 vmi_counter(u32 flags)
-{
- /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
- * cycle counter. */
- return flags & VMI_ALARM_COUNTER_MASK;
-}
-
-/* paravirt_ops.get_wallclock = vmi_get_wallclock */
-unsigned long vmi_get_wallclock(void)
-{
- unsigned long long wallclock;
- wallclock = vmi_timer_ops.get_wallclock(); // nsec
- (void)do_div(wallclock, 1000000000); // sec
-
- return wallclock;
-}
-
-/* paravirt_ops.set_wallclock = vmi_set_wallclock */
-int vmi_set_wallclock(unsigned long now)
-{
- return 0;
-}
-
-/* paravirt_ops.sched_clock = vmi_sched_clock */
-unsigned long long vmi_sched_clock(void)
-{
- return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
-}
-
-/* x86_platform.calibrate_tsc = vmi_tsc_khz */
-unsigned long vmi_tsc_khz(void)
-{
- unsigned long long khz;
- khz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(khz, 1000);
- return khz;
-}
-
-static inline unsigned int vmi_get_timer_vector(void)
-{
- return IRQ0_VECTOR;
-}
-
-/** vmi clockchip */
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned int startup_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-
- return (val & APIC_SEND_PENDING);
-}
-
-static void mask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
-}
-
-static void unmask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
-}
-
-static void ack_timer_irq(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-static struct irq_chip vmi_chip __read_mostly = {
- .name = "VMI-LOCAL",
- .startup = startup_timer_irq,
- .mask = mask_timer_irq,
- .unmask = unmask_timer_irq,
- .ack = ack_timer_irq
-};
-#endif
-
-/** vmi clockevent */
-#define VMI_ALARM_WIRED_IRQ0 0x00000000
-#define VMI_ALARM_WIRED_LVTT 0x00010000
-static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
-
-static inline int vmi_get_alarm_wiring(void)
-{
- return vmi_wiring;
-}
-
-static void vmi_timer_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- cycle_t now, cycles_per_hz;
- BUG_ON(!irqs_disabled());
-
- switch (mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- case CLOCK_EVT_MODE_RESUME:
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_hz, HZ);
- now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
- vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- switch (evt->mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
- break;
- default:
- break;
- }
- break;
- default:
- break;
- }
-}
-
-static int vmi_timer_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- /* Unfortunately, set_next_event interface only passes relative
- * expiry, but we want absolute expiry. It'd be better if were
- * were passed an absolute expiry, since a bunch of time may
- * have been stolen between the time the delta is computed and
- * when we set the alarm below. */
- cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
-
- BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
- vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
- return 0;
-}
-
-static struct clock_event_device vmi_clockevent = {
- .name = "vmi-timer",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .shift = 22,
- .set_mode = vmi_timer_set_mode,
- .set_next_event = vmi_timer_next_event,
- .rating = 1000,
- .irq = 0,
-};
-
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
- struct clock_event_device *evt = &__get_cpu_var(local_events);
- evt->event_handler(evt);
- return IRQ_HANDLED;
-}
-
-static struct irqaction vmi_clock_action = {
- .name = "vmi-timer",
- .handler = vmi_timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
-};
-
-static void __devinit vmi_time_init_clockevent(void)
-{
- cycle_t cycles_per_msec;
- struct clock_event_device *evt;
-
- int cpu = smp_processor_id();
- evt = &__get_cpu_var(local_events);
-
- /* Use cycles_per_msec since div_sc params are 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- memcpy(evt, &vmi_clockevent, sizeof(*evt));
- /* Must pick .shift such that .mult fits in 32-bits. Choosing
- * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
- * before overflow. */
- evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
- /* Upper bound is clockevent's use of ulong for cycle deltas. */
- evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
- evt->min_delta_ns = clockevent_delta2ns(1, evt);
- evt->cpumask = cpumask_of(cpu);
-
- printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
- evt->name, evt->mult, evt->shift);
- clockevents_register_device(evt);
-}
-
-void __init vmi_time_init(void)
-{
- unsigned int cpu;
- /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
- outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
- vmi_time_init_clockevent();
- setup_irq(0, &vmi_clock_action);
- for_each_possible_cpu(cpu)
- per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-void __devinit vmi_time_bsp_init(void)
-{
- /*
- * On APIC systems, we want local timers to fire on each cpu. We do
- * this by programming LVTT to deliver timer events to the IRQ handler
- * for IRQ-0, since we can't re-use the APIC local timer handler
- * without interfering with that code.
- */
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
- local_irq_disable();
-#ifdef CONFIG_SMP
- /*
- * XXX handle_percpu_irq only defined for SMP; we need to switch over
- * to using it, since this is a local interrupt, which each CPU must
- * handle individually without locking out or dropping simultaneous
- * local timers on other CPUs. We also don't want to trigger the
- * quirk workaround code for interrupts which gets invoked from
- * handle_percpu_irq via eoi, so we use our own IRQ chip.
- */
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
-#else
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
-#endif
- vmi_wiring = VMI_ALARM_WIRED_LVTT;
- apic_write(APIC_LVTT, vmi_get_timer_vector());
- local_irq_enable();
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-}
-
-void __devinit vmi_time_ap_init(void)
-{
- vmi_time_init_clockevent();
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-}
-#endif
-
-/** vmi clocksource */
-static struct clocksource clocksource_vmi;
-
-static cycle_t read_real_cycles(struct clocksource *cs)
-{
- cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
- return max(ret, clocksource_vmi.cycle_last);
-}
-
-static struct clocksource clocksource_vmi = {
- .name = "vmi-timer",
- .rating = 450,
- .read = read_real_cycles,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /* to be set */
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static int __init init_vmi_clocksource(void)
-{
- cycle_t cycles_per_msec;
-
- if (!vmi_timer_ops.get_cycle_frequency)
- return 0;
- /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- /* Note that clocksource.{mult, shift} converts in the opposite direction
- * as clockevents. */
- clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
- clocksource_vmi.shift);
-
- printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
- return clocksource_register(&clocksource_vmi);
-
-}
-module_init(init_vmi_clocksource);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 77d8c0f4817d..22b06f7660f4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1056,14 +1056,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
vcpu->arch.apic = apic;
- apic->regs_page = alloc_page(GFP_KERNEL);
+ apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (apic->regs_page == NULL) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
goto nomem_free_apic;
}
apic->regs = page_address(apic->regs_page);
- memset(apic->regs, 0, PAGE_SIZE);
apic->vcpu = vcpu;
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a09c625d526..6c2ecf0a806d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1991,13 +1991,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
+ 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+ F(F16C);
/* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
- F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
- 0 /* SKINIT */ | 0 /* WDT */;
+ F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+ 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9d5f55848455..73b1e1a1f489 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -791,22 +791,22 @@ static void lguest_flush_tlb_kernel(void)
* simple as setting a bit. We don't actually "ack" interrupts as such, we
* just mask and unmask them. I wonder if we should be cleverer?
*/
-static void disable_lguest_irq(unsigned int irq)
+static void disable_lguest_irq(struct irq_data *data)
{
- set_bit(irq, lguest_data.blocked_interrupts);
+ set_bit(data->irq, lguest_data.blocked_interrupts);
}
-static void enable_lguest_irq(unsigned int irq)
+static void enable_lguest_irq(struct irq_data *data)
{
- clear_bit(irq, lguest_data.blocked_interrupts);
+ clear_bit(data->irq, lguest_data.blocked_interrupts);
}
/* This structure describes the lguest IRQ controller. */
static struct irq_chip lguest_irq_controller = {
.name = "lguest",
- .mask = disable_lguest_irq,
- .mask_ack = disable_lguest_irq,
- .unmask = enable_lguest_irq,
+ .irq_mask = disable_lguest_irq,
+ .irq_mask_ack = disable_lguest_irq,
+ .irq_unmask = enable_lguest_irq,
};
/*
@@ -838,12 +838,12 @@ static void __init lguest_init_IRQ(void)
* rather than set them in lguest_init_IRQ we are called here every time an
* lguest device needs an interrupt.
*
- * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
+ * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
* pass that up!
*/
void lguest_setup_irq(unsigned int irq)
{
- irq_to_desc_alloc_node(irq, 0);
+ irq_alloc_desc_at(irq, 0);
set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
handle_level_irq, "level");
}
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 5415a9d06f53..b908a59eccf5 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset);
void *memmove(void *dest, const void *src, size_t n)
{
- int d0, d1, d2;
+ int d0,d1,d2,d3,d4,d5;
+ char *ret = dest;
+
+ __asm__ __volatile__(
+ /* Handle more 16bytes in loop */
+ "cmp $0x10, %0\n\t"
+ "jb 1f\n\t"
+
+ /* Decide forward/backward copy mode */
+ "cmp %2, %1\n\t"
+ "jb 2f\n\t"
+
+ /*
+ * movs instruction have many startup latency
+ * so we handle small size by general register.
+ */
+ "cmp $680, %0\n\t"
+ "jb 3f\n\t"
+ /*
+ * movs instruction is only good for aligned case.
+ */
+ "mov %1, %3\n\t"
+ "xor %2, %3\n\t"
+ "and $0xff, %3\n\t"
+ "jz 4f\n\t"
+ "3:\n\t"
+ "sub $0x10, %0\n\t"
+
+ /*
+ * We gobble 16byts forward in each loop.
+ */
+ "3:\n\t"
+ "sub $0x10, %0\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov 1*4(%1), %4\n\t"
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, 1*4(%2)\n\t"
+ "mov 2*4(%1), %3\n\t"
+ "mov 3*4(%1), %4\n\t"
+ "mov %3, 2*4(%2)\n\t"
+ "mov %4, 3*4(%2)\n\t"
+ "lea 0x10(%1), %1\n\t"
+ "lea 0x10(%2), %2\n\t"
+ "jae 3b\n\t"
+ "add $0x10, %0\n\t"
+ "jmp 1f\n\t"
+
+ /*
+ * Handle data forward by movs.
+ */
+ ".p2align 4\n\t"
+ "4:\n\t"
+ "mov -4(%1, %0), %3\n\t"
+ "lea -4(%2, %0), %4\n\t"
+ "shr $2, %0\n\t"
+ "rep movsl\n\t"
+ "mov %3, (%4)\n\t"
+ "jmp 11f\n\t"
+ /*
+ * Handle data backward by movs.
+ */
+ ".p2align 4\n\t"
+ "6:\n\t"
+ "mov (%1), %3\n\t"
+ "mov %2, %4\n\t"
+ "lea -4(%1, %0), %1\n\t"
+ "lea -4(%2, %0), %2\n\t"
+ "shr $2, %0\n\t"
+ "std\n\t"
+ "rep movsl\n\t"
+ "mov %3,(%4)\n\t"
+ "cld\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Start to prepare for backward copy.
+ */
+ ".p2align 4\n\t"
+ "2:\n\t"
+ "cmp $680, %0\n\t"
+ "jb 5f\n\t"
+ "mov %1, %3\n\t"
+ "xor %2, %3\n\t"
+ "and $0xff, %3\n\t"
+ "jz 6b\n\t"
+
+ /*
+ * Calculate copy position to tail.
+ */
+ "5:\n\t"
+ "add %0, %1\n\t"
+ "add %0, %2\n\t"
+ "sub $0x10, %0\n\t"
+
+ /*
+ * We gobble 16byts backward in each loop.
+ */
+ "7:\n\t"
+ "sub $0x10, %0\n\t"
+
+ "mov -1*4(%1), %3\n\t"
+ "mov -2*4(%1), %4\n\t"
+ "mov %3, -1*4(%2)\n\t"
+ "mov %4, -2*4(%2)\n\t"
+ "mov -3*4(%1), %3\n\t"
+ "mov -4*4(%1), %4\n\t"
+ "mov %3, -3*4(%2)\n\t"
+ "mov %4, -4*4(%2)\n\t"
+ "lea -0x10(%1), %1\n\t"
+ "lea -0x10(%2), %2\n\t"
+ "jae 7b\n\t"
+ /*
+ * Calculate copy position to head.
+ */
+ "add $0x10, %0\n\t"
+ "sub %0, %1\n\t"
+ "sub %0, %2\n\t"
+
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ ".p2align 4\n\t"
+ "1:\n\t"
+ "cmp $8, %0\n\t"
+ "jb 8f\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov 1*4(%1), %4\n\t"
+ "mov -2*4(%1, %0), %5\n\t"
+ "mov -1*4(%1, %0), %1\n\t"
+
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, 1*4(%2)\n\t"
+ "mov %5, -2*4(%2, %0)\n\t"
+ "mov %1, -1*4(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ ".p2align 4\n\t"
+ "8:\n\t"
+ "cmp $4, %0\n\t"
+ "jb 9f\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov -1*4(%1, %0), %4\n\t"
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, -1*4(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data from 2 bytes to 3 bytes.
+ */
+ ".p2align 4\n\t"
+ "9:\n\t"
+ "cmp $2, %0\n\t"
+ "jb 10f\n\t"
+ "movw 0*2(%1), %%dx\n\t"
+ "movw -1*2(%1, %0), %%bx\n\t"
+ "movw %%dx, 0*2(%2)\n\t"
+ "movw %%bx, -1*2(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data for 1 byte.
+ */
+ ".p2align 4\n\t"
+ "10:\n\t"
+ "cmp $1, %0\n\t"
+ "jb 11f\n\t"
+ "movb (%1), %%cl\n\t"
+ "movb %%cl, (%2)\n\t"
+ ".p2align 4\n\t"
+ "11:"
+ : "=&c" (d0), "=&S" (d1), "=&D" (d2),
+ "=r" (d3),"=r" (d4), "=r"(d5)
+ :"0" (n),
+ "1" (src),
+ "2" (dest)
+ :"memory");
+
+ return ret;
- if (dest < src) {
- memcpy(dest, src, n);
- } else {
- __asm__ __volatile__(
- "std\n\t"
- "rep\n\t"
- "movsb\n\t"
- "cld"
- : "=&c" (d0), "=&S" (d1), "=&D" (d2)
- :"0" (n),
- "1" (n-1+src),
- "2" (n-1+dest)
- :"memory");
- }
- return dest;
}
EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..75ef61e35e38 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
-
- /*
- * Put the number of full 64-byte blocks into %ecx.
- * Tail portion is handled at the end:
- */
movq %rdi, %rax
- movl %edx, %ecx
- shrl $6, %ecx
- jz .Lhandle_tail
+ /*
+ * Use 32bit CMP here to avoid long NOP padding.
+ */
+ cmp $0x20, %edx
+ jb .Lhandle_tail
+
+ /*
+ * We check whether memory false dependece could occur,
+ * then jump to corresponding copy mode.
+ */
+ cmp %dil, %sil
+ jl .Lcopy_backward
+ subl $0x20, %edx
+.Lcopy_forward_loop:
+ subq $0x20, %rdx
+
+ /*
+ * Move in blocks of 4x8 bytes:
+ */
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq 2*8(%rsi), %r10
+ movq 3*8(%rsi), %r11
+ leaq 4*8(%rsi), %rsi
+
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, 2*8(%rdi)
+ movq %r11, 3*8(%rdi)
+ leaq 4*8(%rdi), %rdi
+ jae .Lcopy_forward_loop
+ addq $0x20, %rdx
+ jmp .Lhandle_tail
+
+.Lcopy_backward:
+ /*
+ * Calculate copy position to tail.
+ */
+ addq %rdx, %rsi
+ addq %rdx, %rdi
+ subq $0x20, %rdx
+ /*
+ * At most 3 ALU operations in one cycle,
+ * so append NOPS in the same 16bytes trunk.
+ */
.p2align 4
-.Lloop_64:
- /*
- * We decrement the loop index here - and the zero-flag is
- * checked at the end of the loop (instructions inbetween do
- * not change the zero flag):
- */
- decl %ecx
+.Lcopy_backward_loop:
+ subq $0x20, %rdx
+ movq -1*8(%rsi), %r8
+ movq -2*8(%rsi), %r9
+ movq -3*8(%rsi), %r10
+ movq -4*8(%rsi), %r11
+ leaq -4*8(%rsi), %rsi
+ movq %r8, -1*8(%rdi)
+ movq %r9, -2*8(%rdi)
+ movq %r10, -3*8(%rdi)
+ movq %r11, -4*8(%rdi)
+ leaq -4*8(%rdi), %rdi
+ jae .Lcopy_backward_loop
/*
- * Move in blocks of 4x16 bytes:
+ * Calculate copy position to head.
*/
- movq 0*8(%rsi), %r11
- movq 1*8(%rsi), %r8
- movq %r11, 0*8(%rdi)
- movq %r8, 1*8(%rdi)
-
- movq 2*8(%rsi), %r9
- movq 3*8(%rsi), %r10
- movq %r9, 2*8(%rdi)
- movq %r10, 3*8(%rdi)
-
- movq 4*8(%rsi), %r11
- movq 5*8(%rsi), %r8
- movq %r11, 4*8(%rdi)
- movq %r8, 5*8(%rdi)
-
- movq 6*8(%rsi), %r9
- movq 7*8(%rsi), %r10
- movq %r9, 6*8(%rdi)
- movq %r10, 7*8(%rdi)
-
- leaq 64(%rsi), %rsi
- leaq 64(%rdi), %rdi
-
- jnz .Lloop_64
-
+ addq $0x20, %rdx
+ subq %rdx, %rsi
+ subq %rdx, %rdi
.Lhandle_tail:
- movl %edx, %ecx
- andl $63, %ecx
- shrl $3, %ecx
- jz .Lhandle_7
+ cmpq $16, %rdx
+ jb .Lless_16bytes
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq -2*8(%rsi, %rdx), %r10
+ movq -1*8(%rsi, %rdx), %r11
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, -2*8(%rdi, %rdx)
+ movq %r11, -1*8(%rdi, %rdx)
+ retq
.p2align 4
-.Lloop_8:
- decl %ecx
- movq (%rsi), %r8
- movq %r8, (%rdi)
- leaq 8(%rdi), %rdi
- leaq 8(%rsi), %rsi
- jnz .Lloop_8
-
-.Lhandle_7:
- movl %edx, %ecx
- andl $7, %ecx
- jz .Lend
-
+.Lless_16bytes:
+ cmpq $8, %rdx
+ jb .Lless_8bytes
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq -1*8(%rsi, %rdx), %r9
+ movq %r8, 0*8(%rdi)
+ movq %r9, -1*8(%rdi, %rdx)
+ retq
.p2align 4
+.Lless_8bytes:
+ cmpq $4, %rdx
+ jb .Lless_3bytes
+
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ movl (%rsi), %ecx
+ movl -4(%rsi, %rdx), %r8d
+ movl %ecx, (%rdi)
+ movl %r8d, -4(%rdi, %rdx)
+ retq
+ .p2align 4
+.Lless_3bytes:
+ cmpl $0, %edx
+ je .Lend
+ /*
+ * Move data from 1 bytes to 3 bytes.
+ */
.Lloop_1:
movb (%rsi), %r8b
movb %r8b, (%rdi)
incq %rdi
incq %rsi
- decl %ecx
+ decl %edx
jnz .Lloop_1
.Lend:
- ret
+ retq
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909bf122..6d0f0ec41b34 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,185 @@
#undef memmove
void *memmove(void *dest, const void *src, size_t count)
{
- if (dest < src) {
- return memcpy(dest, src, count);
- } else {
- char *p = dest + count;
- const char *s = src + count;
- while (count--)
- *--p = *--s;
- }
- return dest;
+ unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
+ char *ret;
+
+ __asm__ __volatile__(
+ /* Handle more 32bytes in loop */
+ "mov %2, %3\n\t"
+ "cmp $0x20, %0\n\t"
+ "jb 1f\n\t"
+
+ /* Decide forward/backward copy mode */
+ "cmp %2, %1\n\t"
+ "jb 2f\n\t"
+
+ /*
+ * movsq instruction have many startup latency
+ * so we handle small size by general register.
+ */
+ "cmp $680, %0\n\t"
+ "jb 3f\n\t"
+ /*
+ * movsq instruction is only good for aligned case.
+ */
+ "cmpb %%dil, %%sil\n\t"
+ "je 4f\n\t"
+ "3:\n\t"
+ "sub $0x20, %0\n\t"
+ /*
+ * We gobble 32byts forward in each loop.
+ */
+ "5:\n\t"
+ "sub $0x20, %0\n\t"
+ "movq 0*8(%1), %4\n\t"
+ "movq 1*8(%1), %5\n\t"
+ "movq 2*8(%1), %6\n\t"
+ "movq 3*8(%1), %7\n\t"
+ "leaq 4*8(%1), %1\n\t"
+
+ "movq %4, 0*8(%2)\n\t"
+ "movq %5, 1*8(%2)\n\t"
+ "movq %6, 2*8(%2)\n\t"
+ "movq %7, 3*8(%2)\n\t"
+ "leaq 4*8(%2), %2\n\t"
+ "jae 5b\n\t"
+ "addq $0x20, %0\n\t"
+ "jmp 1f\n\t"
+ /*
+ * Handle data forward by movsq.
+ */
+ ".p2align 4\n\t"
+ "4:\n\t"
+ "movq %0, %8\n\t"
+ "movq -8(%1, %0), %4\n\t"
+ "lea -8(%2, %0), %5\n\t"
+ "shrq $3, %8\n\t"
+ "rep movsq\n\t"
+ "movq %4, (%5)\n\t"
+ "jmp 13f\n\t"
+ /*
+ * Handle data backward by movsq.
+ */
+ ".p2align 4\n\t"
+ "7:\n\t"
+ "movq %0, %8\n\t"
+ "movq (%1), %4\n\t"
+ "movq %2, %5\n\t"
+ "leaq -8(%1, %0), %1\n\t"
+ "leaq -8(%2, %0), %2\n\t"
+ "shrq $3, %8\n\t"
+ "std\n\t"
+ "rep movsq\n\t"
+ "cld\n\t"
+ "movq %4, (%5)\n\t"
+ "jmp 13f\n\t"
+
+ /*
+ * Start to prepare for backward copy.
+ */
+ ".p2align 4\n\t"
+ "2:\n\t"
+ "cmp $680, %0\n\t"
+ "jb 6f \n\t"
+ "cmp %%dil, %%sil\n\t"
+ "je 7b \n\t"
+ "6:\n\t"
+ /*
+ * Calculate copy position to tail.
+ */
+ "addq %0, %1\n\t"
+ "addq %0, %2\n\t"
+ "subq $0x20, %0\n\t"
+ /*
+ * We gobble 32byts backward in each loop.
+ */
+ "8:\n\t"
+ "subq $0x20, %0\n\t"
+ "movq -1*8(%1), %4\n\t"
+ "movq -2*8(%1), %5\n\t"
+ "movq -3*8(%1), %6\n\t"
+ "movq -4*8(%1), %7\n\t"
+ "leaq -4*8(%1), %1\n\t"
+
+ "movq %4, -1*8(%2)\n\t"
+ "movq %5, -2*8(%2)\n\t"
+ "movq %6, -3*8(%2)\n\t"
+ "movq %7, -4*8(%2)\n\t"
+ "leaq -4*8(%2), %2\n\t"
+ "jae 8b\n\t"
+ /*
+ * Calculate copy position to head.
+ */
+ "addq $0x20, %0\n\t"
+ "subq %0, %1\n\t"
+ "subq %0, %2\n\t"
+ "1:\n\t"
+ "cmpq $16, %0\n\t"
+ "jb 9f\n\t"
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ "movq 0*8(%1), %4\n\t"
+ "movq 1*8(%1), %5\n\t"
+ "movq -2*8(%1, %0), %6\n\t"
+ "movq -1*8(%1, %0), %7\n\t"
+ "movq %4, 0*8(%2)\n\t"
+ "movq %5, 1*8(%2)\n\t"
+ "movq %6, -2*8(%2, %0)\n\t"
+ "movq %7, -1*8(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ ".p2align 4\n\t"
+ "9:\n\t"
+ "cmpq $8, %0\n\t"
+ "jb 10f\n\t"
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ "movq 0*8(%1), %4\n\t"
+ "movq -1*8(%1, %0), %5\n\t"
+ "movq %4, 0*8(%2)\n\t"
+ "movq %5, -1*8(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ "10:\n\t"
+ "cmpq $4, %0\n\t"
+ "jb 11f\n\t"
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ "movl (%1), %4d\n\t"
+ "movl -4(%1, %0), %5d\n\t"
+ "movl %4d, (%2)\n\t"
+ "movl %5d, -4(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ "11:\n\t"
+ "cmp $2, %0\n\t"
+ "jb 12f\n\t"
+ /*
+ * Move data from 2 bytes to 3 bytes.
+ */
+ "movw (%1), %4w\n\t"
+ "movw -2(%1, %0), %5w\n\t"
+ "movw %4w, (%2)\n\t"
+ "movw %5w, -2(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ "12:\n\t"
+ "cmp $1, %0\n\t"
+ "jb 13f\n\t"
+ /*
+ * Move data for 1 byte.
+ */
+ "movb (%1), %4b\n\t"
+ "movb %4b, (%2)\n\t"
+ "13:\n\t"
+ : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
+ "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
+ :"0" (count),
+ "1" (src),
+ "2" (dest)
+ :"memory");
+
+ return ret;
+
}
EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a24c6cfdccc4..79b0b372d2d0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
spin_lock_irqsave(&pgd_lock, flags);
list_for_each_entry(page, &pgd_list, lru) {
- if (!vmalloc_sync_one(page_address(page), address))
+ spinlock_t *pgt_lock;
+ pmd_t *ret;
+
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+ spin_lock(pgt_lock);
+ ret = vmalloc_sync_one(page_address(page), address);
+ spin_unlock(pgt_lock);
+
+ if (!ret)
break;
}
spin_unlock_irqrestore(&pgd_lock, flags);
@@ -328,29 +337,7 @@ out:
void vmalloc_sync_all(void)
{
- unsigned long address;
-
- for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
- address += PGDIR_SIZE) {
-
- const pgd_t *pgd_ref = pgd_offset_k(address);
- unsigned long flags;
- struct page *page;
-
- if (pgd_none(*pgd_ref))
- continue;
-
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- pgd_t *pgd;
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- }
+ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
}
/*
@@ -898,8 +885,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
if (pmd_large(*pmd))
return spurious_fault_check(error_code, (pte_t *) pmd);
+ /*
+ * Note: don't use pte_present() here, since it returns true
+ * if the _PAGE_PROTNONE bit is set. However, this aliases the
+ * _PAGE_GLOBAL bit, which for kernel pages give false positives
+ * when CONFIG_DEBUG_PAGEALLOC is used.
+ */
pte = pte_offset_kernel(pmd, address);
- if (!pte_present(*pte))
+ if (!(pte_flags(*pte) & _PAGE_PRESENT))
return 0;
ret = spurious_fault_check(error_code, pte);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca79091b9d6..558f2d332076 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -67,7 +67,7 @@ static __init void *alloc_low_page(void)
panic("alloc_low_page: ran out of memory");
adr = __va(pfn * PAGE_SIZE);
- memset(adr, 0, PAGE_SIZE);
+ clear_page(adr);
return adr;
}
@@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE]
static inline void save_pg_dir(void)
{
- memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
+ copy_page(swsusp_pg_dir, swapper_pg_dir);
}
#else /* !CONFIG_ACPI_SLEEP */
static inline void save_pg_dir(void)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a6674689a20..c55f900fbf89 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -97,6 +97,43 @@ static int __init nonx32_setup(char *str)
}
__setup("noexec32=", nonx32_setup);
+/*
+ * When memory was added/removed make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+ unsigned long address;
+
+ for (address = start; address <= end; address += PGDIR_SIZE) {
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ unsigned long flags;
+ struct page *page;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ spinlock_t *pgt_lock;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ spin_lock(pgt_lock);
+
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd)
+ != pgd_page_vaddr(*pgd_ref));
+
+ spin_unlock(pgt_lock);
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+}
+
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -293,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
panic("alloc_low_page: ran out of memory");
adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
- memset(adr, 0, PAGE_SIZE);
+ clear_page(adr);
*phys = pfn * PAGE_SIZE;
return adr;
}
@@ -534,11 +571,13 @@ kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask)
{
-
+ bool pgd_changed = false;
unsigned long next, last_map_addr = end;
+ unsigned long addr;
start = (unsigned long)__va(start);
end = (unsigned long)__va(end);
+ addr = start;
for (; start < end; start = next) {
pgd_t *pgd = pgd_offset_k(start);
@@ -563,7 +602,12 @@ kernel_physical_mapping_init(unsigned long start,
spin_lock(&init_mm.page_table_lock);
pgd_populate(&init_mm, pgd, __va(pud_phys));
spin_unlock(&init_mm.page_table_lock);
+ pgd_changed = true;
}
+
+ if (pgd_changed)
+ sync_global_pgds(addr, end);
+
__flush_tlb_all();
return last_map_addr;
@@ -1003,6 +1047,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
}
}
+ sync_global_pgds((unsigned long)start_page, end);
return 0;
}
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 970ed579d4e4..52d54bfc1ebb 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,7 +22,7 @@
#include
#include
#include
-#include
+#include
static struct bootnode __initdata nodes[8];
static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
@@ -54,8 +54,8 @@ static __init int find_northbridge(void)
static __init void early_get_boot_cpu_id(void)
{
/*
- * need to get boot_cpu_id so can use that to create apicid_to_node
- * in k8_scan_nodes()
+ * need to get the APIC ID of the BSP so can use that to
+ * create apicid_to_node in k8_scan_nodes()
*/
#ifdef CONFIG_X86_MPPARSE
/*
@@ -212,7 +212,7 @@ int __init k8_scan_nodes(void)
bits = boot_cpu_data.x86_coreid_bits;
cores = (1< 0) {
pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6f..324aa3f07237 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
b == 0xf0 || b == 0xf2 || b == 0xf3
/* Group 2 */
|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
- || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+ || b == 0x64 || b == 0x65
/* Group 3 */
|| b == 0x66
/* Group 4 */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23ef96c..4962f1aeda6f 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -18,7 +18,7 @@
#include
#include
#include
-#include
+#include
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590e..8be8c7d7bc89 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
-static void pgd_ctor(pgd_t *pgd)
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+ BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+ virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+ return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd)
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
- paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
- __pa(swapper_pg_dir) >> PAGE_SHIFT,
- KERNEL_PGD_BOUNDARY,
- KERNEL_PGD_PTRS);
}
/* list required to sync kernel mapping updates */
- if (!SHARED_KERNEL_PMD)
+ if (!SHARED_KERNEL_PMD) {
+ pgd_set_mm(pgd, mm);
pgd_list_add(pgd);
+ }
}
static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
*/
spin_lock_irqsave(&pgd_lock, flags);
- pgd_ctor(pgd);
+ pgd_ctor(mm, pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab6667..49358481c733 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
#include
#include
#include
+#include
#include
#include
@@ -52,6 +53,8 @@ union smp_flush_state {
want false sharing in the per cpu data segment. */
static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
+static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
+
/*
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
union smp_flush_state *f;
/* Caller has disabled preemption */
- sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+ sender = this_cpu_read(tlb_vector_offset);
f = &flush_state[sender];
/*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
flush_tlb_others_ipi(cpumask, mm, va);
}
+static void __cpuinit calculate_tlb_offset(void)
+{
+ int cpu, node, nr_node_vecs;
+ /*
+ * we are changing tlb_vector_offset for each CPU in runtime, but this
+ * will not cause inconsistency, as the write is atomic under X86. we
+ * might see more lock contentions in a short time, but after all CPU's
+ * tlb_vector_offset are changed, everything should go normal
+ *
+ * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
+ * waste some vectors.
+ **/
+ if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
+ nr_node_vecs = 1;
+ else
+ nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
+
+ for_each_online_node(node) {
+ int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+ nr_node_vecs;
+ int cpu_offset = 0;
+ for_each_cpu(cpu, cpumask_of_node(node)) {
+ per_cpu(tlb_vector_offset, cpu) = node_offset +
+ cpu_offset;
+ cpu_offset++;
+ cpu_offset = cpu_offset % nr_node_vecs;
+ }
+ }
+}
+
+static int tlb_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ switch (action & 0xf) {
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ calculate_tlb_offset();
+ }
+ return NOTIFY_OK;
+}
+
static int __cpuinit init_smp_flush(void)
{
int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
for (i = 0; i < ARRAY_SIZE(flush_state); i++)
raw_spin_lock_init(&flush_state[i].tlbstate_lock);
+ calculate_tlb_offset();
+ hotcpu_notifier(tlb_cpuhp_notify, 0);
return 0;
}
core_initcall(init_smp_flush);
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index b67a6b5aa8d4..42fb46f83883 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -64,15 +64,22 @@ static u64 ibs_op_ctl;
* IBS cpuid feature detection
*/
-#define IBS_CPUID_FEATURES 0x8000001b
+#define IBS_CPUID_FEATURES 0x8000001b
/*
* Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
* bit 0 is used to indicate the existence of IBS.
*/
-#define IBS_CAPS_AVAIL (1LL<<0)
-#define IBS_CAPS_RDWROPCNT (1LL<<3)
-#define IBS_CAPS_OPCNT (1LL<<4)
+#define IBS_CAPS_AVAIL (1U<<0)
+#define IBS_CAPS_RDWROPCNT (1U<<3)
+#define IBS_CAPS_OPCNT (1U<<4)
+
+/*
+ * IBS APIC setup
+ */
+#define IBSCTL 0x1cc
+#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8)
+#define IBSCTL_LVT_OFFSET_MASK 0x0F
/*
* IBS randomization macros
@@ -266,6 +273,74 @@ static void op_amd_stop_ibs(void)
wrmsrl(MSR_AMD64_IBSOPCTL, 0);
}
+static inline int eilvt_is_available(int offset)
+{
+ /* check if we may assign a vector */
+ return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
+}
+
+static inline int ibs_eilvt_valid(void)
+{
+ u64 val;
+ int offset;
+
+ rdmsrl(MSR_AMD64_IBSCTL, val);
+ if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+ pr_err(FW_BUG "cpu %d, invalid IBS "
+ "interrupt offset %d (MSR%08X=0x%016llx)",
+ smp_processor_id(), offset,
+ MSR_AMD64_IBSCTL, val);
+ return 0;
+ }
+
+ offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+ if (eilvt_is_available(offset))
+ return !0;
+
+ pr_err(FW_BUG "cpu %d, IBS interrupt offset %d "
+ "not available (MSR%08X=0x%016llx)",
+ smp_processor_id(), offset,
+ MSR_AMD64_IBSCTL, val);
+
+ return 0;
+}
+
+static inline int get_ibs_offset(void)
+{
+ u64 val;
+
+ rdmsrl(MSR_AMD64_IBSCTL, val);
+ if (!(val & IBSCTL_LVT_OFFSET_VALID))
+ return -EINVAL;
+
+ return val & IBSCTL_LVT_OFFSET_MASK;
+}
+
+static void setup_APIC_ibs(void)
+{
+ int offset;
+
+ offset = get_ibs_offset();
+ if (offset < 0)
+ goto failed;
+
+ if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+ return;
+failed:
+ pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n",
+ smp_processor_id());
+}
+
+static void clear_APIC_ibs(void)
+{
+ int offset;
+
+ offset = get_ibs_offset();
+ if (offset >= 0)
+ setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
@@ -376,13 +451,13 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
}
if (ibs_caps)
- setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
+ setup_APIC_ibs();
}
static void op_amd_cpu_shutdown(void)
{
if (ibs_caps)
- setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
+ clear_APIC_ibs();
}
static int op_amd_check_ctrs(struct pt_regs * const regs,
@@ -445,16 +520,11 @@ static void op_amd_stop(struct op_msrs const * const msrs)
op_amd_stop_ibs();
}
-static int __init_ibs_nmi(void)
+static int setup_ibs_ctl(int ibs_eilvt_off)
{
-#define IBSCTL_LVTOFFSETVAL (1 << 8)
-#define IBSCTL 0x1cc
struct pci_dev *cpu_cfg;
int nodes;
u32 value = 0;
- u8 ibs_eilvt_off;
-
- ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
nodes = 0;
cpu_cfg = NULL;
@@ -466,24 +536,63 @@ static int __init_ibs_nmi(void)
break;
++nodes;
pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
- | IBSCTL_LVTOFFSETVAL);
+ | IBSCTL_LVT_OFFSET_VALID);
pci_read_config_dword(cpu_cfg, IBSCTL, &value);
- if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
+ if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
pci_dev_put(cpu_cfg);
printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
- "IBSCTL = 0x%08x", value);
- return 1;
+ "IBSCTL = 0x%08x\n", value);
+ return -EINVAL;
}
} while (1);
if (!nodes) {
- printk(KERN_DEBUG "No CPU node configured for IBS");
- return 1;
+ printk(KERN_DEBUG "No CPU node configured for IBS\n");
+ return -ENODEV;
}
return 0;
}
+static int force_ibs_eilvt_setup(void)
+{
+ int i;
+ int ret;
+
+ /* find the next free available EILVT entry */
+ for (i = 1; i < 4; i++) {
+ if (!eilvt_is_available(i))
+ continue;
+ ret = setup_ibs_ctl(i);
+ if (ret)
+ return ret;
+ return 0;
+ }
+
+ printk(KERN_DEBUG "No EILVT entry available\n");
+
+ return -EBUSY;
+}
+
+static int __init_ibs_nmi(void)
+{
+ int ret;
+
+ if (ibs_eilvt_valid())
+ return 0;
+
+ ret = force_ibs_eilvt_setup();
+ if (ret)
+ return ret;
+
+ if (!ibs_eilvt_valid())
+ return -EFAULT;
+
+ pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
+
+ return 0;
+}
+
/* initialize the APIC for the IBS interrupts if available */
static void init_ibs(void)
{
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index b34815408f58..13700ec8e2e4 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = {
int __init pci_olpc_init(void)
{
- printk(KERN_INFO "PCI: Using configuration type OLPC\n");
+ printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n");
raw_pci_ops = &pci_olpc_conf;
is_lx = is_geode_lx();
return 0;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 42086ac406af..b2363fcbcd0f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1969,7 +1969,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
.alloc_pmd = xen_alloc_pmd_init,
- .alloc_pmd_clone = paravirt_nop,
.release_pmd = xen_release_pmd_init,
#ifdef CONFIG_X86_64
diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c
index c64a5d387de5..87508886cbbd 100644
--- a/arch/xtensa/kernel/irq.c
+++ b/arch/xtensa/kernel/irq.c
@@ -92,7 +92,7 @@ int show_interrupts(struct seq_file *p, void *v)
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
#endif
- seq_printf(p, " %14s", irq_desc[i].chip->typename);
+ seq_printf(p, " %14s", irq_desc[i].chip->name);
seq_printf(p, " %s", action->name);
for (action=action->next; action; action = action->next)
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index 6b115f6c4313..6afceb3d4034 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -30,18 +30,13 @@
#include
#include
#include
+#include
#define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad"
#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator"
#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80
static DEFINE_MUTEX(isolated_cpus_lock);
-#define MWAIT_SUBSTATE_MASK (0xf)
-#define MWAIT_CSTATE_MASK (0xf)
-#define MWAIT_SUBSTATE_SIZE (4)
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
static unsigned long power_saving_mwait_eax;
static unsigned char tsc_detected_unstable;
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 9fc630ce1ddb..f6f37a05a0c3 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -45,7 +45,8 @@ static ssize_t show_##name(struct sys_device *dev, \
return sprintf(buf, "%d\n", topology_##name(cpu)); \
}
-#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
+#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) || \
+ defined(topology_book_cpumask)
static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
{
ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
@@ -114,6 +115,14 @@ define_siblings_show_func(core_cpumask);
define_one_ro_named(core_siblings, show_core_cpumask);
define_one_ro_named(core_siblings_list, show_core_cpumask_list);
+#ifdef CONFIG_SCHED_BOOK
+define_id_show_func(book_id);
+define_one_ro(book_id);
+define_siblings_show_func(book_cpumask);
+define_one_ro_named(book_siblings, show_book_cpumask);
+define_one_ro_named(book_siblings_list, show_book_cpumask_list);
+#endif
+
static struct attribute *default_attrs[] = {
&attr_physical_package_id.attr,
&attr_core_id.attr,
@@ -121,6 +130,11 @@ static struct attribute *default_attrs[] = {
&attr_thread_siblings_list.attr,
&attr_core_siblings.attr,
&attr_core_siblings_list.attr,
+#ifdef CONFIG_SCHED_BOOK
+ &attr_book_id.attr,
+ &attr_book_siblings.attr,
+ &attr_book_siblings_list.attr,
+#endif
NULL
};
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index de277689da61..4b9359a6f6ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -488,4 +488,21 @@ config BLK_DEV_HD
If unsure, say N.
+config BLK_DEV_RBD
+ tristate "Rados block device (RBD)"
+ depends on INET && EXPERIMENTAL && BLOCK
+ select CEPH_LIB
+ select LIBCRC32C
+ select CRYPTO_AES
+ select CRYPTO
+ default n
+ help
+ Say Y here if you want include the Rados block device, which stripes
+ a block device over objects stored in the Ceph distributed object
+ store.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index aff5ac925c34..d7f463d6312d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
+obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 000000000000..6ec9d53806c5
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,1841 @@
+/*
+ rbd.c -- Export ceph rados objects as a Linux block device
+
+
+ based on drivers/block/osdblk.c:
+
+ Copyright 2009 Red Hat, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+ Instructions for use
+ --------------------
+
+ 1) Map a Linux block device to an existing rbd image.
+
+ Usage: [snap name]
+
+ $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
+
+ The snapshot name can be "-" or omitted to map the image read/write.
+
+ 2) List all active blkdev<->object mappings.
+
+ In this example, we have performed step #1 twice, creating two blkdevs,
+ mapped to two separate rados objects in the rados rbd pool
+
+ $ cat /sys/class/rbd/list
+ #id major client_name pool name snap KB
+ 0 254 client4143 rbd foo - 1024000
+
+ The columns, in order, are:
+ - blkdev unique id
+ - blkdev assigned major
+ - rados client id
+ - rados pool name
+ - rados block device name
+ - mapped snapshot ("-" if none)
+ - device size in KB
+
+
+ 3) Create a snapshot.
+
+ Usage:
+
+ $ echo "0 mysnap" > /sys/class/rbd/snap_create
+
+
+ 4) Listing a snapshot.
+
+ $ cat /sys/class/rbd/snaps_list
+ #id snap KB
+ 0 - 1024000 (*)
+ 0 foo 1024000
+
+ The columns, in order, are:
+ - blkdev unique id
+ - snapshot name, '-' means none (active read/write version)
+ - size of device at time of snapshot
+ - the (*) indicates this is the active version
+
+ 5) Rollback to snapshot.
+
+ Usage:
+
+ $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
+
+
+ 6) Mapping an image using snapshot.
+
+ A snapshot mapping is read-only. This is being done by passing
+ snap= to the options when adding a device.
+
+ $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
+
+
+ 7) Remove an active blkdev<->rbd image mapping.
+
+ In this example, we remove the mapping with blkdev unique id 1.
+
+ $ echo 1 > /sys/class/rbd/remove
+
+
+ NOTE: The actual creation and deletion of rados objects is outside the scope
+ of this driver.
+
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "rbd_types.h"
+
+#define DRV_NAME "rbd"
+#define DRV_NAME_LONG "rbd (rados block device)"
+
+#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
+
+#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_POOL_NAME_LEN 64
+#define RBD_MAX_SNAP_NAME_LEN 32
+#define RBD_MAX_OPT_LEN 1024
+
+#define RBD_SNAP_HEAD_NAME "-"
+
+#define DEV_NAME_LEN 32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+ u64 image_size;
+ char block_name[32];
+ __u8 obj_order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ struct rw_semaphore snap_rwsem;
+ struct ceph_snap_context *snapc;
+ size_t snap_names_len;
+ u64 snap_seq;
+ u32 total_snaps;
+
+ char *snap_names;
+ u64 *snap_sizes;
+};
+
+/*
+ * an instance of the client. multiple devices may share a client.
+ */
+struct rbd_client {
+ struct ceph_client *client;
+ struct kref kref;
+ struct list_head node;
+};
+
+/*
+ * a single io request
+ */
+struct rbd_request {
+ struct request *rq; /* blk layer request */
+ struct bio *bio; /* cloned bio */
+ struct page **pages; /* list of used pages */
+ u64 len;
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+ int id; /* blkdev unique id */
+
+ int major; /* blkdev assigned major */
+ struct gendisk *disk; /* blkdev's gendisk and rq */
+ struct request_queue *q;
+
+ struct ceph_client *client;
+ struct rbd_client *rbd_client;
+
+ char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+ spinlock_t lock; /* queue lock */
+
+ struct rbd_image_header header;
+ char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
+ int obj_len;
+ char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+ char pool_name[RBD_MAX_POOL_NAME_LEN];
+ int poolid;
+
+ char snap_name[RBD_MAX_SNAP_NAME_LEN];
+ u32 cur_snap; /* index+1 of current snapshot within snap context
+ 0 - for the head */
+ int read_only;
+
+ struct list_head node;
+};
+
+static spinlock_t node_lock; /* protects client get/put */
+
+static struct class *class_rbd; /* /sys/class/rbd */
+static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
+static LIST_HEAD(rbd_dev_list); /* devices */
+static LIST_HEAD(rbd_client_list); /* clients */
+
+
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct rbd_device *rbd_dev = disk->private_data;
+
+ set_device_ro(bdev, rbd_dev->read_only);
+
+ if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+ return -EROFS;
+
+ return 0;
+}
+
+static const struct block_device_operations rbd_bd_ops = {
+ .owner = THIS_MODULE,
+ .open = rbd_open,
+};
+
+/*
+ * Initialize an rbd client instance.
+ * We own *opt.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+{
+ struct rbd_client *rbdc;
+ int ret = -ENOMEM;
+
+ dout("rbd_client_create\n");
+ rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+ if (!rbdc)
+ goto out_opt;
+
+ kref_init(&rbdc->kref);
+ INIT_LIST_HEAD(&rbdc->node);
+
+ rbdc->client = ceph_create_client(opt, rbdc);
+ if (IS_ERR(rbdc->client))
+ goto out_rbdc;
+ opt = NULL; /* Now rbdc->client is responsible for opt */
+
+ ret = ceph_open_session(rbdc->client);
+ if (ret < 0)
+ goto out_err;
+
+ spin_lock(&node_lock);
+ list_add_tail(&rbdc->node, &rbd_client_list);
+ spin_unlock(&node_lock);
+
+ dout("rbd_client_create created %p\n", rbdc);
+ return rbdc;
+
+out_err:
+ ceph_destroy_client(rbdc->client);
+out_rbdc:
+ kfree(rbdc);
+out_opt:
+ if (opt)
+ ceph_destroy_options(opt);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Find a ceph client with specific addr and configuration.
+ */
+static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+{
+ struct rbd_client *client_node;
+
+ if (opt->flags & CEPH_OPT_NOSHARE)
+ return NULL;
+
+ list_for_each_entry(client_node, &rbd_client_list, node)
+ if (ceph_compare_options(opt, client_node->client) == 0)
+ return client_node;
+ return NULL;
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it.
+ */
+static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
+ char *options)
+{
+ struct rbd_client *rbdc;
+ struct ceph_options *opt;
+ int ret;
+
+ ret = ceph_parse_options(&opt, options, mon_addr,
+ mon_addr + strlen(mon_addr), NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ spin_lock(&node_lock);
+ rbdc = __rbd_client_find(opt);
+ if (rbdc) {
+ ceph_destroy_options(opt);
+
+ /* using an existing client */
+ kref_get(&rbdc->kref);
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->client = rbdc->client;
+ spin_unlock(&node_lock);
+ return 0;
+ }
+ spin_unlock(&node_lock);
+
+ rbdc = rbd_client_create(opt);
+ if (IS_ERR(rbdc))
+ return PTR_ERR(rbdc);
+
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->client = rbdc->client;
+ return 0;
+}
+
+/*
+ * Destroy ceph client
+ */
+static void rbd_client_release(struct kref *kref)
+{
+ struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+ dout("rbd_release_client %p\n", rbdc);
+ spin_lock(&node_lock);
+ list_del(&rbdc->node);
+ spin_unlock(&node_lock);
+
+ ceph_destroy_client(rbdc->client);
+ kfree(rbdc);
+}
+
+/*
+ * Drop reference to ceph client node. If it's not referenced anymore, release
+ * it.
+ */
+static void rbd_put_client(struct rbd_device *rbd_dev)
+{
+ kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
+ rbd_dev->rbd_client = NULL;
+ rbd_dev->client = NULL;
+}
+
+
+/*
+ * Create a new header structure, translate header format from the on-disk
+ * header.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+ struct rbd_image_header_ondisk *ondisk,
+ int allocated_snaps,
+ gfp_t gfp_flags)
+{
+ int i;
+ u32 snap_count = le32_to_cpu(ondisk->snap_count);
+ int ret = -ENOMEM;
+
+ init_rwsem(&header->snap_rwsem);
+
+ header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+ header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
+ snap_count *
+ sizeof(struct rbd_image_snap_ondisk),
+ gfp_flags);
+ if (!header->snapc)
+ return -ENOMEM;
+ if (snap_count) {
+ header->snap_names = kmalloc(header->snap_names_len,
+ GFP_KERNEL);
+ if (!header->snap_names)
+ goto err_snapc;
+ header->snap_sizes = kmalloc(snap_count * sizeof(u64),
+ GFP_KERNEL);
+ if (!header->snap_sizes)
+ goto err_names;
+ } else {
+ header->snap_names = NULL;
+ header->snap_sizes = NULL;
+ }
+ memcpy(header->block_name, ondisk->block_name,
+ sizeof(ondisk->block_name));
+
+ header->image_size = le64_to_cpu(ondisk->image_size);
+ header->obj_order = ondisk->options.order;
+ header->crypt_type = ondisk->options.crypt_type;
+ header->comp_type = ondisk->options.comp_type;
+
+ atomic_set(&header->snapc->nref, 1);
+ header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+ header->snapc->num_snaps = snap_count;
+ header->total_snaps = snap_count;
+
+ if (snap_count &&
+ allocated_snaps == snap_count) {
+ for (i = 0; i < snap_count; i++) {
+ header->snapc->snaps[i] =
+ le64_to_cpu(ondisk->snaps[i].id);
+ header->snap_sizes[i] =
+ le64_to_cpu(ondisk->snaps[i].image_size);
+ }
+
+ /* copy snapshot names */
+ memcpy(header->snap_names, &ondisk->snaps[i],
+ header->snap_names_len);
+ }
+
+ return 0;
+
+err_names:
+ kfree(header->snap_names);
+err_snapc:
+ kfree(header->snapc);
+ return ret;
+}
+
+static int snap_index(struct rbd_image_header *header, int snap_num)
+{
+ return header->total_snaps - snap_num;
+}
+
+static u64 cur_snap_id(struct rbd_device *rbd_dev)
+{
+ struct rbd_image_header *header = &rbd_dev->header;
+
+ if (!rbd_dev->cur_snap)
+ return 0;
+
+ return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
+}
+
+static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
+ u64 *seq, u64 *size)
+{
+ int i;
+ char *p = header->snap_names;
+
+ for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+ if (strcmp(snap_name, p) == 0)
+ break;
+ }
+ if (i == header->total_snaps)
+ return -ENOENT;
+ if (seq)
+ *seq = header->snapc->snaps[i];
+
+ if (size)
+ *size = header->snap_sizes[i];
+
+ return i;
+}
+
+static int rbd_header_set_snap(struct rbd_device *dev,
+ const char *snap_name,
+ u64 *size)
+{
+ struct rbd_image_header *header = &dev->header;
+ struct ceph_snap_context *snapc = header->snapc;
+ int ret = -ENOENT;
+
+ down_write(&header->snap_rwsem);
+
+ if (!snap_name ||
+ !*snap_name ||
+ strcmp(snap_name, "-") == 0 ||
+ strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
+ if (header->total_snaps)
+ snapc->seq = header->snap_seq;
+ else
+ snapc->seq = 0;
+ dev->cur_snap = 0;
+ dev->read_only = 0;
+ if (size)
+ *size = header->image_size;
+ } else {
+ ret = snap_by_name(header, snap_name, &snapc->seq, size);
+ if (ret < 0)
+ goto done;
+
+ dev->cur_snap = header->total_snaps - ret;
+ dev->read_only = 1;
+ }
+
+ ret = 0;
+done:
+ up_write(&header->snap_rwsem);
+ return ret;
+}
+
+static void rbd_header_free(struct rbd_image_header *header)
+{
+ kfree(header->snapc);
+ kfree(header->snap_names);
+ kfree(header->snap_sizes);
+}
+
+/*
+ * get the actual striped segment name, offset and length
+ */
+static u64 rbd_get_segment(struct rbd_image_header *header,
+ const char *block_name,
+ u64 ofs, u64 len,
+ char *seg_name, u64 *segofs)
+{
+ u64 seg = ofs >> header->obj_order;
+
+ if (seg_name)
+ snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
+ "%s.%012llx", block_name, seg);
+
+ ofs = ofs & ((1 << header->obj_order) - 1);
+ len = min_t(u64, len, (1 << header->obj_order) - ofs);
+
+ if (segofs)
+ *segofs = ofs;
+
+ return len;
+}
+
+/*
+ * bio helpers
+ */
+
+static void bio_chain_put(struct bio *chain)
+{
+ struct bio *tmp;
+
+ while (chain) {
+ tmp = chain;
+ chain = chain->bi_next;
+ bio_put(tmp);
+ }
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+ struct bio_vec *bv;
+ unsigned long flags;
+ void *buf;
+ int i;
+ int pos = 0;
+
+ while (chain) {
+ bio_for_each_segment(bv, chain, i) {
+ if (pos + bv->bv_len > start_ofs) {
+ int remainder = max(start_ofs - pos, 0);
+ buf = bvec_kmap_irq(bv, &flags);
+ memset(buf + remainder, 0,
+ bv->bv_len - remainder);
+ bvec_kunmap_irq(buf, &flags);
+ }
+ pos += bv->bv_len;
+ }
+
+ chain = chain->bi_next;
+ }
+}
+
+/*
+ * bio_chain_clone - clone a chain of bios up to a certain length.
+ * might return a bio_pair that will need to be released.
+ */
+static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
+ struct bio_pair **bp,
+ int len, gfp_t gfpmask)
+{
+ struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
+ int total = 0;
+
+ if (*bp) {
+ bio_pair_release(*bp);
+ *bp = NULL;
+ }
+
+ while (old_chain && (total < len)) {
+ tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+ if (!tmp)
+ goto err_out;
+
+ if (total + old_chain->bi_size > len) {
+ struct bio_pair *bp;
+
+ /*
+ * this split can only happen with a single paged bio,
+ * split_bio will BUG_ON if this is not the case
+ */
+ dout("bio_chain_clone split! total=%d remaining=%d"
+ "bi_size=%d\n",
+ (int)total, (int)len-total,
+ (int)old_chain->bi_size);
+
+ /* split the bio. We'll release it either in the next
+ call, or it will have to be released outside */
+ bp = bio_split(old_chain, (len - total) / 512ULL);
+ if (!bp)
+ goto err_out;
+
+ __bio_clone(tmp, &bp->bio1);
+
+ *next = &bp->bio2;
+ } else {
+ __bio_clone(tmp, old_chain);
+ *next = old_chain->bi_next;
+ }
+
+ tmp->bi_bdev = NULL;
+ gfpmask &= ~__GFP_WAIT;
+ tmp->bi_next = NULL;
+
+ if (!new_chain) {
+ new_chain = tail = tmp;
+ } else {
+ tail->bi_next = tmp;
+ tail = tmp;
+ }
+ old_chain = old_chain->bi_next;
+
+ total += tmp->bi_size;
+ }
+
+ BUG_ON(total < len);
+
+ if (tail)
+ tail->bi_next = NULL;
+
+ *old = old_chain;
+
+ return new_chain;
+
+err_out:
+ dout("bio_chain_clone with err\n");
+ bio_chain_put(new_chain);
+ return NULL;
+}
+
+/*
+ * helpers for osd request op vectors.
+ */
+static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
+ int num_ops,
+ int opcode,
+ u32 payload_len)
+{
+ *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
+ GFP_NOIO);
+ if (!*ops)
+ return -ENOMEM;
+ (*ops)[0].op = opcode;
+ /*
+ * op extent offset and length will be set later on
+ * in calc_raw_layout()
+ */
+ (*ops)[0].payload_len = payload_len;
+ return 0;
+}
+
+static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+{
+ kfree(ops);
+}
+
+/*
+ * Send ceph osd request
+ */
+static int rbd_do_request(struct request *rq,
+ struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ const char *obj, u64 ofs, u64 len,
+ struct bio *bio,
+ struct page **pages,
+ int num_pages,
+ int flags,
+ struct ceph_osd_req_op *ops,
+ int num_reply,
+ void (*rbd_cb)(struct ceph_osd_request *req,
+ struct ceph_msg *msg))
+{
+ struct ceph_osd_request *req;
+ struct ceph_file_layout *layout;
+ int ret;
+ u64 bno;
+ struct timespec mtime = CURRENT_TIME;
+ struct rbd_request *req_data;
+ struct ceph_osd_request_head *reqhead;
+ struct rbd_image_header *header = &dev->header;
+
+ ret = -ENOMEM;
+ req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
+ if (!req_data)
+ goto done;
+
+ dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
+
+ down_read(&header->snap_rwsem);
+
+ req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
+ snapc,
+ ops,
+ false,
+ GFP_NOIO, pages, bio);
+ if (IS_ERR(req)) {
+ up_read(&header->snap_rwsem);
+ ret = PTR_ERR(req);
+ goto done_pages;
+ }
+
+ req->r_callback = rbd_cb;
+
+ req_data->rq = rq;
+ req_data->bio = bio;
+ req_data->pages = pages;
+ req_data->len = len;
+
+ req->r_priv = req_data;
+
+ reqhead = req->r_request->front.iov_base;
+ reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+
+ strncpy(req->r_oid, obj, sizeof(req->r_oid));
+ req->r_oid_len = strlen(req->r_oid);
+
+ layout = &req->r_file_layout;
+ memset(layout, 0, sizeof(*layout));
+ layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ layout->fl_stripe_count = cpu_to_le32(1);
+ layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ layout->fl_pg_preferred = cpu_to_le32(-1);
+ layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+ ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
+ ofs, &len, &bno, req, ops);
+
+ ceph_osdc_build_request(req, ofs, &len,
+ ops,
+ snapc,
+ &mtime,
+ req->r_oid, req->r_oid_len);
+ up_read(&header->snap_rwsem);
+
+ ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+ if (ret < 0)
+ goto done_err;
+
+ if (!rbd_cb) {
+ ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+ ceph_osdc_put_request(req);
+ }
+ return ret;
+
+done_err:
+ bio_chain_put(req_data->bio);
+ ceph_osdc_put_request(req);
+done_pages:
+ kfree(req_data);
+done:
+ if (rq)
+ blk_end_request(rq, ret, len);
+ return ret;
+}
+
+/*
+ * Ceph osd op callback
+ */
+static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+ struct rbd_request *req_data = req->r_priv;
+ struct ceph_osd_reply_head *replyhead;
+ struct ceph_osd_op *op;
+ __s32 rc;
+ u64 bytes;
+ int read_op;
+
+ /* parse reply */
+ replyhead = msg->front.iov_base;
+ WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+ op = (void *)(replyhead + 1);
+ rc = le32_to_cpu(replyhead->result);
+ bytes = le64_to_cpu(op->extent.length);
+ read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+
+ dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+
+ if (rc == -ENOENT && read_op) {
+ zero_bio_chain(req_data->bio, 0);
+ rc = 0;
+ } else if (rc == 0 && read_op && bytes < req_data->len) {
+ zero_bio_chain(req_data->bio, bytes);
+ bytes = req_data->len;
+ }
+
+ blk_end_request(req_data->rq, rc, bytes);
+
+ if (req_data->bio)
+ bio_chain_put(req_data->bio);
+
+ ceph_osdc_put_request(req);
+ kfree(req_data);
+}
+
+/*
+ * Do a synchronous ceph osd operation
+ */
+static int rbd_req_sync_op(struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ int opcode,
+ int flags,
+ struct ceph_osd_req_op *orig_ops,
+ int num_reply,
+ const char *obj,
+ u64 ofs, u64 len,
+ char *buf)
+{
+ int ret;
+ struct page **pages;
+ int num_pages;
+ struct ceph_osd_req_op *ops = orig_ops;
+ u32 payload_len;
+
+ num_pages = calc_pages_for(ofs , len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ if (!orig_ops) {
+ payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
+ ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+ if (ret < 0)
+ goto done;
+
+ if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
+ ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
+ if (ret < 0)
+ goto done_ops;
+ }
+ }
+
+ ret = rbd_do_request(NULL, dev, snapc, snapid,
+ obj, ofs, len, NULL,
+ pages, num_pages,
+ flags,
+ ops,
+ 2,
+ NULL);
+ if (ret < 0)
+ goto done_ops;
+
+ if ((flags & CEPH_OSD_FLAG_READ) && buf)
+ ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
+
+done_ops:
+ if (!orig_ops)
+ rbd_destroy_ops(ops);
+done:
+ ceph_release_page_vector(pages, num_pages);
+ return ret;
+}
+
+/*
+ * Do an asynchronous ceph osd operation
+ */
+static int rbd_do_op(struct request *rq,
+ struct rbd_device *rbd_dev ,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ int opcode, int flags, int num_reply,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ char *seg_name;
+ u64 seg_ofs;
+ u64 seg_len;
+ int ret;
+ struct ceph_osd_req_op *ops;
+ u32 payload_len;
+
+ seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+ if (!seg_name)
+ return -ENOMEM;
+
+ seg_len = rbd_get_segment(&rbd_dev->header,
+ rbd_dev->header.block_name,
+ ofs, len,
+ seg_name, &seg_ofs);
+
+ payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+
+ ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+ if (ret < 0)
+ goto done;
+
+ /* we've taken care of segment sizes earlier when we
+ cloned the bios. We should never have a segment
+ truncated at this point */
+ BUG_ON(seg_len < len);
+
+ ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
+ seg_name, seg_ofs, seg_len,
+ bio,
+ NULL, 0,
+ flags,
+ ops,
+ num_reply,
+ rbd_req_cb);
+done:
+ kfree(seg_name);
+ return ret;
+}
+
+/*
+ * Request async osd write
+ */
+static int rbd_req_write(struct request *rq,
+ struct rbd_device *rbd_dev,
+ struct ceph_snap_context *snapc,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ 2,
+ ofs, len, bio);
+}
+
+/*
+ * Request async osd read
+ */
+static int rbd_req_read(struct request *rq,
+ struct rbd_device *rbd_dev,
+ u64 snapid,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ return rbd_do_op(rq, rbd_dev, NULL,
+ (snapid ? snapid : CEPH_NOSNAP),
+ CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ,
+ 2,
+ ofs, len, bio);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_read(struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ const char *obj,
+ u64 ofs, u64 len,
+ char *buf)
+{
+ return rbd_req_sync_op(dev, NULL,
+ (snapid ? snapid : CEPH_NOSNAP),
+ CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ,
+ NULL,
+ 1, obj, ofs, len, buf);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
+ u64 snapid,
+ const char *obj)
+{
+ struct ceph_osd_req_op *ops;
+ int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
+ if (ret < 0)
+ return ret;
+
+ ops[0].snap.snapid = snapid;
+
+ ret = rbd_req_sync_op(dev, NULL,
+ CEPH_NOSNAP,
+ 0,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ ops,
+ 1, obj, 0, 0, NULL);
+
+ rbd_destroy_ops(ops);
+
+ if (ret < 0)
+ return ret;
+
+ return ret;
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_exec(struct rbd_device *dev,
+ const char *obj,
+ const char *cls,
+ const char *method,
+ const char *data,
+ int len)
+{
+ struct ceph_osd_req_op *ops;
+ int cls_len = strlen(cls);
+ int method_len = strlen(method);
+ int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
+ cls_len + method_len + len);
+ if (ret < 0)
+ return ret;
+
+ ops[0].cls.class_name = cls;
+ ops[0].cls.class_len = (__u8)cls_len;
+ ops[0].cls.method_name = method;
+ ops[0].cls.method_len = (__u8)method_len;
+ ops[0].cls.argc = 0;
+ ops[0].cls.indata = data;
+ ops[0].cls.indata_len = len;
+
+ ret = rbd_req_sync_op(dev, NULL,
+ CEPH_NOSNAP,
+ 0,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ ops,
+ 1, obj, 0, 0, NULL);
+
+ rbd_destroy_ops(ops);
+
+ dout("cls_exec returned %d\n", ret);
+ return ret;
+}
+
+/*
+ * block device queue callback
+ */
+static void rbd_rq_fn(struct request_queue *q)
+{
+ struct rbd_device *rbd_dev = q->queuedata;
+ struct request *rq;
+ struct bio_pair *bp = NULL;
+
+ rq = blk_fetch_request(q);
+
+ while (1) {
+ struct bio *bio;
+ struct bio *rq_bio, *next_bio = NULL;
+ bool do_write;
+ int size, op_size = 0;
+ u64 ofs;
+
+ /* peek at request from block layer */
+ if (!rq)
+ break;
+
+ dout("fetched request\n");
+
+ /* filter out block requests we don't understand */
+ if ((rq->cmd_type != REQ_TYPE_FS)) {
+ __blk_end_request_all(rq, 0);
+ goto next;
+ }
+
+ /* deduce our operation (read, write) */
+ do_write = (rq_data_dir(rq) == WRITE);
+
+ size = blk_rq_bytes(rq);
+ ofs = blk_rq_pos(rq) * 512ULL;
+ rq_bio = rq->bio;
+ if (do_write && rbd_dev->read_only) {
+ __blk_end_request_all(rq, -EROFS);
+ goto next;
+ }
+
+ spin_unlock_irq(q->queue_lock);
+
+ dout("%s 0x%x bytes at 0x%llx\n",
+ do_write ? "write" : "read",
+ size, blk_rq_pos(rq) * 512ULL);
+
+ do {
+ /* a bio clone to be passed down to OSD req */
+ dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+ op_size = rbd_get_segment(&rbd_dev->header,
+ rbd_dev->header.block_name,
+ ofs, size,
+ NULL, NULL);
+ bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
+ op_size, GFP_ATOMIC);
+ if (!bio) {
+ spin_lock_irq(q->queue_lock);
+ __blk_end_request_all(rq, -ENOMEM);
+ goto next;
+ }
+
+ /* init OSD command: write or read */
+ if (do_write)
+ rbd_req_write(rq, rbd_dev,
+ rbd_dev->header.snapc,
+ ofs,
+ op_size, bio);
+ else
+ rbd_req_read(rq, rbd_dev,
+ cur_snap_id(rbd_dev),
+ ofs,
+ op_size, bio);
+
+ size -= op_size;
+ ofs += op_size;
+
+ rq_bio = next_bio;
+ } while (size > 0);
+
+ if (bp)
+ bio_pair_release(bp);
+
+ spin_lock_irq(q->queue_lock);
+next:
+ rq = blk_fetch_request(q);
+ }
+}
+
+/*
+ * a queue callback. Makes sure that we don't create a bio that spans across
+ * multiple osd objects. One exception would be with a single page bios,
+ * which we handle later at bio_chain_clone
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+ struct bio_vec *bvec)
+{
+ struct rbd_device *rbd_dev = q->queuedata;
+ unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
+ sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+ unsigned int bio_sectors = bmd->bi_size >> 9;
+ int max;
+
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+ + bio_sectors)) << 9;
+ if (max < 0)
+ max = 0; /* bio_add cannot handle a negative return */
+ if (max <= bvec->bv_len && bio_sectors == 0)
+ return bvec->bv_len;
+ return max;
+}
+
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+ struct gendisk *disk = rbd_dev->disk;
+
+ if (!disk)
+ return;
+
+ rbd_header_free(&rbd_dev->header);
+
+ if (disk->flags & GENHD_FL_UP)
+ del_gendisk(disk);
+ if (disk->queue)
+ blk_cleanup_queue(disk->queue);
+ put_disk(disk);
+}
+
+/*
+ * reload the ondisk the header
+ */
+static int rbd_read_header(struct rbd_device *rbd_dev,
+ struct rbd_image_header *header)
+{
+ ssize_t rc;
+ struct rbd_image_header_ondisk *dh;
+ int snap_count = 0;
+ u64 snap_names_len = 0;
+
+ while (1) {
+ int len = sizeof(*dh) +
+ snap_count * sizeof(struct rbd_image_snap_ondisk) +
+ snap_names_len;
+
+ rc = -ENOMEM;
+ dh = kmalloc(len, GFP_KERNEL);
+ if (!dh)
+ return -ENOMEM;
+
+ rc = rbd_req_sync_read(rbd_dev,
+ NULL, CEPH_NOSNAP,
+ rbd_dev->obj_md_name,
+ 0, len,
+ (char *)dh);
+ if (rc < 0)
+ goto out_dh;
+
+ rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
+ if (rc < 0)
+ goto out_dh;
+
+ if (snap_count != header->total_snaps) {
+ snap_count = header->total_snaps;
+ snap_names_len = header->snap_names_len;
+ rbd_header_free(header);
+ kfree(dh);
+ continue;
+ }
+ break;
+ }
+
+out_dh:
+ kfree(dh);
+ return rc;
+}
+
+/*
+ * create a snapshot
+ */
+static int rbd_header_add_snap(struct rbd_device *dev,
+ const char *snap_name,
+ gfp_t gfp_flags)
+{
+ int name_len = strlen(snap_name);
+ u64 new_snapid;
+ int ret;
+ void *data, *data_start, *data_end;
+
+ /* we should create a snapshot only if we're pointing at the head */
+ if (dev->cur_snap)
+ return -EINVAL;
+
+ ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
+ &new_snapid);
+ dout("created snapid=%lld\n", new_snapid);
+ if (ret < 0)
+ return ret;
+
+ data = kmalloc(name_len + 16, gfp_flags);
+ if (!data)
+ return -ENOMEM;
+
+ data_start = data;
+ data_end = data + name_len + 16;
+
+ ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
+ ceph_encode_64_safe(&data, data_end, new_snapid, bad);
+
+ ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+ data_start, data - data_start);
+
+ kfree(data_start);
+
+ if (ret < 0)
+ return ret;
+
+ dev->header.snapc->seq = new_snapid;
+
+ return 0;
+bad:
+ return -ERANGE;
+}
+
+/*
+ * only read the first part of the ondisk header, without the snaps info
+ */
+static int rbd_update_snaps(struct rbd_device *rbd_dev)
+{
+ int ret;
+ struct rbd_image_header h;
+ u64 snap_seq;
+
+ ret = rbd_read_header(rbd_dev, &h);
+ if (ret < 0)
+ return ret;
+
+ down_write(&rbd_dev->header.snap_rwsem);
+
+ snap_seq = rbd_dev->header.snapc->seq;
+
+ kfree(rbd_dev->header.snapc);
+ kfree(rbd_dev->header.snap_names);
+ kfree(rbd_dev->header.snap_sizes);
+
+ rbd_dev->header.total_snaps = h.total_snaps;
+ rbd_dev->header.snapc = h.snapc;
+ rbd_dev->header.snap_names = h.snap_names;
+ rbd_dev->header.snap_sizes = h.snap_sizes;
+ rbd_dev->header.snapc->seq = snap_seq;
+
+ up_write(&rbd_dev->header.snap_rwsem);
+
+ return 0;
+}
+
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+ struct gendisk *disk;
+ struct request_queue *q;
+ int rc;
+ u64 total_size = 0;
+
+ /* contact OSD, request size info about the object being mapped */
+ rc = rbd_read_header(rbd_dev, &rbd_dev->header);
+ if (rc)
+ return rc;
+
+ rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+ if (rc)
+ return rc;
+
+ /* create gendisk info */
+ rc = -ENOMEM;
+ disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+ if (!disk)
+ goto out;
+
+ sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
+ disk->major = rbd_dev->major;
+ disk->first_minor = 0;
+ disk->fops = &rbd_bd_ops;
+ disk->private_data = rbd_dev;
+
+ /* init rq */
+ rc = -ENOMEM;
+ q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+ if (!q)
+ goto out_disk;
+ blk_queue_merge_bvec(q, rbd_merge_bvec);
+ disk->queue = q;
+
+ q->queuedata = rbd_dev;
+
+ rbd_dev->disk = disk;
+ rbd_dev->q = q;
+
+ /* finally, announce the disk to the world */
+ set_capacity(disk, total_size / 512ULL);
+ add_disk(disk);
+
+ pr_info("%s: added with size 0x%llx\n",
+ disk->disk_name, (unsigned long long)total_size);
+ return 0;
+
+out_disk:
+ put_disk(disk);
+out:
+ return rc;
+}
+
+/********************************************************************
+ * /sys/class/rbd/
+ * add map rados objects to blkdev
+ * remove unmap rados objects
+ * list show mappings
+ *******************************************************************/
+
+static void class_rbd_release(struct class *cls)
+{
+ kfree(cls);
+}
+
+static ssize_t class_rbd_list(struct class *c,
+ struct class_attribute *attr,
+ char *data)
+{
+ int n = 0;
+ struct list_head *tmp;
+ int max = PAGE_SIZE;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ n += snprintf(data, max,
+ "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
+
+ list_for_each(tmp, &rbd_dev_list) {
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ n += snprintf(data+n, max-n,
+ "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
+ rbd_dev->id,
+ rbd_dev->major,
+ ceph_client_id(rbd_dev->client),
+ rbd_dev->pool_name,
+ rbd_dev->obj, rbd_dev->snap_name,
+ rbd_dev->header.image_size >> 10);
+ if (n == max)
+ break;
+ }
+
+ mutex_unlock(&ctl_mutex);
+ return n;
+}
+
+static ssize_t class_rbd_add(struct class *c,
+ struct class_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ceph_osd_client *osdc;
+ struct rbd_device *rbd_dev;
+ ssize_t rc = -ENOMEM;
+ int irc, new_id = 0;
+ struct list_head *tmp;
+ char *mon_dev_name;
+ char *options;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+ if (!mon_dev_name)
+ goto err_out_mod;
+
+ options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+ if (!options)
+ goto err_mon_dev;
+
+ /* new rbd_device object */
+ rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+ if (!rbd_dev)
+ goto err_out_opt;
+
+ /* static rbd_device initialization */
+ spin_lock_init(&rbd_dev->lock);
+ INIT_LIST_HEAD(&rbd_dev->node);
+
+ /* generate unique id: find highest unique id, add one */
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ list_for_each(tmp, &rbd_dev_list) {
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_dev->id >= new_id)
+ new_id = rbd_dev->id + 1;
+ }
+
+ rbd_dev->id = new_id;
+
+ /* add to global list */
+ list_add_tail(&rbd_dev->node, &rbd_dev_list);
+
+ /* parse add command */
+ if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
+ "%" __stringify(RBD_MAX_OPT_LEN) "s "
+ "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
+ "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
+ "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+ mon_dev_name, options, rbd_dev->pool_name,
+ rbd_dev->obj, rbd_dev->snap_name) < 4) {
+ rc = -EINVAL;
+ goto err_out_slot;
+ }
+
+ if (rbd_dev->snap_name[0] == 0)
+ rbd_dev->snap_name[0] = '-';
+
+ rbd_dev->obj_len = strlen(rbd_dev->obj);
+ snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
+ rbd_dev->obj, RBD_SUFFIX);
+
+ /* initialize rest of new object */
+ snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
+ rc = rbd_get_client(rbd_dev, mon_dev_name, options);
+ if (rc < 0)
+ goto err_out_slot;
+
+ mutex_unlock(&ctl_mutex);
+
+ /* pick the pool */
+ osdc = &rbd_dev->client->osdc;
+ rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
+ if (rc < 0)
+ goto err_out_client;
+ rbd_dev->poolid = rc;
+
+ /* register our block device */
+ irc = register_blkdev(0, rbd_dev->name);
+ if (irc < 0) {
+ rc = irc;
+ goto err_out_client;
+ }
+ rbd_dev->major = irc;
+
+ /* set up and announce blkdev mapping */
+ rc = rbd_init_disk(rbd_dev);
+ if (rc)
+ goto err_out_blkdev;
+
+ return count;
+
+err_out_blkdev:
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_client:
+ rbd_put_client(rbd_dev);
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+err_out_slot:
+ list_del_init(&rbd_dev->node);
+ mutex_unlock(&ctl_mutex);
+
+ kfree(rbd_dev);
+err_out_opt:
+ kfree(options);
+err_mon_dev:
+ kfree(mon_dev_name);
+err_out_mod:
+ dout("Error adding device %s\n", buf);
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+static struct rbd_device *__rbd_get_dev(unsigned long id)
+{
+ struct list_head *tmp;
+ struct rbd_device *rbd_dev;
+
+ list_for_each(tmp, &rbd_dev_list) {
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_dev->id == id)
+ return rbd_dev;
+ }
+ return NULL;
+}
+
+static ssize_t class_rbd_remove(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, rc;
+ unsigned long ul;
+
+ rc = strict_strtoul(buf, 10, &ul);
+ if (rc)
+ return rc;
+
+ /* convert to int; abort if we lost anything in the conversion */
+ target_id = (int) ul;
+ if (target_id != ul)
+ return -EINVAL;
+
+ /* remove object from list immediately */
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (rbd_dev)
+ list_del_init(&rbd_dev->node);
+
+ mutex_unlock(&ctl_mutex);
+
+ if (!rbd_dev)
+ return -ENOENT;
+
+ rbd_put_client(rbd_dev);
+
+ /* clean up and free blkdev */
+ rbd_free_disk(rbd_dev);
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
+ kfree(rbd_dev);
+
+ /* release module ref */
+ module_put(THIS_MODULE);
+
+ return count;
+}
+
+static ssize_t class_rbd_snaps_list(struct class *c,
+ struct class_attribute *attr,
+ char *data)
+{
+ struct rbd_device *rbd_dev = NULL;
+ struct list_head *tmp;
+ struct rbd_image_header *header;
+ int i, n = 0, max = PAGE_SIZE;
+ int ret;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ n += snprintf(data, max, "#id\tsnap\tKB\n");
+
+ list_for_each(tmp, &rbd_dev_list) {
+ char *names, *p;
+ struct ceph_snap_context *snapc;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ header = &rbd_dev->header;
+
+ down_read(&header->snap_rwsem);
+
+ names = header->snap_names;
+ snapc = header->snapc;
+
+ n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+ rbd_dev->id, RBD_SNAP_HEAD_NAME,
+ header->image_size >> 10,
+ (!rbd_dev->cur_snap ? " (*)" : ""));
+ if (n == max)
+ break;
+
+ p = names;
+ for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+ n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+ rbd_dev->id, p, header->snap_sizes[i] >> 10,
+ (rbd_dev->cur_snap &&
+ (snap_index(header, i) == rbd_dev->cur_snap) ?
+ " (*)" : ""));
+ if (n == max)
+ break;
+ }
+
+ up_read(&header->snap_rwsem);
+ }
+
+
+ ret = n;
+ mutex_unlock(&ctl_mutex);
+ return ret;
+}
+
+static ssize_t class_rbd_snaps_refresh(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, rc;
+ unsigned long ul;
+ int ret = count;
+
+ rc = strict_strtoul(buf, 10, &ul);
+ if (rc)
+ return rc;
+
+ /* convert to int; abort if we lost anything in the conversion */
+ target_id = (int) ul;
+ if (target_id != ul)
+ return -EINVAL;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (!rbd_dev) {
+ ret = -ENOENT;
+ goto done;
+ }
+
+ rc = rbd_update_snaps(rbd_dev);
+ if (rc < 0)
+ ret = rc;
+
+done:
+ mutex_unlock(&ctl_mutex);
+ return ret;
+}
+
+static ssize_t class_rbd_snap_create(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, ret;
+ char *name;
+
+ name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ /* parse snaps add command */
+ if (sscanf(buf, "%d "
+ "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+ &target_id,
+ name) != 2) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (!rbd_dev) {
+ ret = -ENOENT;
+ goto done_unlock;
+ }
+
+ ret = rbd_header_add_snap(rbd_dev,
+ name, GFP_KERNEL);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = rbd_update_snaps(rbd_dev);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = count;
+done_unlock:
+ mutex_unlock(&ctl_mutex);
+done:
+ kfree(name);
+ return ret;
+}
+
+static ssize_t class_rbd_rollback(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, ret;
+ u64 snapid;
+ char snap_name[RBD_MAX_SNAP_NAME_LEN];
+ u64 cur_ofs;
+ char *seg_name;
+
+ /* parse snaps add command */
+ if (sscanf(buf, "%d "
+ "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+ &target_id,
+ snap_name) != 2) {
+ return -EINVAL;
+ }
+
+ ret = -ENOMEM;
+ seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+ if (!seg_name)
+ return ret;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (!rbd_dev) {
+ ret = -ENOENT;
+ goto done_unlock;
+ }
+
+ ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
+ if (ret < 0)
+ goto done_unlock;
+
+ dout("snapid=%lld\n", snapid);
+
+ cur_ofs = 0;
+ while (cur_ofs < rbd_dev->header.image_size) {
+ cur_ofs += rbd_get_segment(&rbd_dev->header,
+ rbd_dev->obj,
+ cur_ofs, (u64)-1,
+ seg_name, NULL);
+ dout("seg_name=%s\n", seg_name);
+
+ ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
+ if (ret < 0)
+ pr_warning("could not roll back obj %s err=%d\n",
+ seg_name, ret);
+ }
+
+ ret = rbd_update_snaps(rbd_dev);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = count;
+
+done_unlock:
+ mutex_unlock(&ctl_mutex);
+ kfree(seg_name);
+
+ return ret;
+}
+
+static struct class_attribute class_rbd_attrs[] = {
+ __ATTR(add, 0200, NULL, class_rbd_add),
+ __ATTR(remove, 0200, NULL, class_rbd_remove),
+ __ATTR(list, 0444, class_rbd_list, NULL),
+ __ATTR(snaps_refresh, 0200, NULL, class_rbd_snaps_refresh),
+ __ATTR(snap_create, 0200, NULL, class_rbd_snap_create),
+ __ATTR(snaps_list, 0444, class_rbd_snaps_list, NULL),
+ __ATTR(snap_rollback, 0200, NULL, class_rbd_rollback),
+ __ATTR_NULL
+};
+
+/*
+ * create control files in sysfs
+ * /sys/class/rbd/...
+ */
+static int rbd_sysfs_init(void)
+{
+ int ret = -ENOMEM;
+
+ class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL);
+ if (!class_rbd)
+ goto out;
+
+ class_rbd->name = DRV_NAME;
+ class_rbd->owner = THIS_MODULE;
+ class_rbd->class_release = class_rbd_release;
+ class_rbd->class_attrs = class_rbd_attrs;
+
+ ret = class_register(class_rbd);
+ if (ret)
+ goto out_class;
+ return 0;
+
+out_class:
+ kfree(class_rbd);
+ class_rbd = NULL;
+ pr_err(DRV_NAME ": failed to create class rbd\n");
+out:
+ return ret;
+}
+
+static void rbd_sysfs_cleanup(void)
+{
+ if (class_rbd)
+ class_destroy(class_rbd);
+ class_rbd = NULL;
+}
+
+int __init rbd_init(void)
+{
+ int rc;
+
+ rc = rbd_sysfs_init();
+ if (rc)
+ return rc;
+ spin_lock_init(&node_lock);
+ pr_info("loaded " DRV_NAME_LONG "\n");
+ return 0;
+}
+
+void __exit rbd_exit(void)
+{
+ rbd_sysfs_cleanup();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Sage Weil ");
+MODULE_AUTHOR("Yehuda Sadeh ");
+MODULE_DESCRIPTION("rados block device");
+
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik ");
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 000000000000..fc6c678aa2cb
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include
+
+/*
+ * rbd image 'foo' consists of objects
+ * foo.rbd - image metadata
+ * foo.00000000
+ * foo.00000001
+ * ... - data
+ */
+
+#define RBD_SUFFIX ".rbd"
+#define RBD_DIRECTORY "rbd_directory"
+#define RBD_INFO "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */
+#define RBD_MIN_OBJ_ORDER 16
+#define RBD_MAX_OBJ_ORDER 30
+
+#define RBD_MAX_OBJ_NAME_LEN 96
+#define RBD_MAX_SEG_NAME_LEN 128
+
+#define RBD_COMP_NONE 0
+#define RBD_CRYPT_NONE 0
+
+#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE "RBD"
+#define RBD_HEADER_VERSION "001.005"
+
+struct rbd_info {
+ __le64 max_id;
+} __attribute__ ((packed));
+
+struct rbd_image_snap_ondisk {
+ __le64 id;
+ __le64 image_size;
+} __attribute__((packed));
+
+struct rbd_image_header_ondisk {
+ char text[40];
+ char block_name[24];
+ char signature[4];
+ char version[8];
+ struct {
+ __u8 order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ __u8 unused;
+ } __attribute__((packed)) options;
+ __le64 image_size;
+ __le64 snap_seq;
+ __le32 snap_count;
+ __le32 reserved;
+ __le64 snap_names_len;
+ struct rbd_image_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+
+#endif
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1101e251a629..8320490226b7 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,7 +2,6 @@
#include
#include
#include
-#include
#include
#include
#include
@@ -222,8 +221,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
return err;
}
-static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long data)
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long data)
{
struct gendisk *disk = bdev->bd_disk;
struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +237,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
(void __user *)data);
}
-static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long param)
-{
- int ret;
-
- lock_kernel();
- ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
-
- return ret;
-}
-
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
diff --git a/drivers/char/agp/Kconfig b/drivers/char/agp/Kconfig
index 4b66c69eaf57..5ddf67e76f8b 100644
--- a/drivers/char/agp/Kconfig
+++ b/drivers/char/agp/Kconfig
@@ -57,7 +57,7 @@ config AGP_AMD
config AGP_AMD64
tristate "AMD Opteron/Athlon64 on-CPU GART support"
- depends on AGP && X86 && K8_NB
+ depends on AGP && X86 && AMD_NB
help
This option gives you AGP support for the GLX component of
X using the on-CPU northbridge of the AMD Athlon64/Opteron CPUs.
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 70312da4c968..42396df55556 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -15,7 +15,7 @@
#include
#include /* PAGE_SIZE */
#include
-#include
+#include
#include
#include "agp.h"
@@ -124,7 +124,7 @@ static int amd64_fetch_size(void)
u32 temp;
struct aper_size_info_32 *values;
- dev = k8_northbridges[0];
+ dev = k8_northbridges.nb_misc[0];
if (dev==NULL)
return 0;
@@ -181,10 +181,14 @@ static int amd_8151_configure(void)
unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
int i;
+ if (!k8_northbridges.gart_supported)
+ return 0;
+
/* Configure AGP regs in each x86-64 host bridge. */
- for (i = 0; i < num_k8_northbridges; i++) {
+ for (i = 0; i < k8_northbridges.num; i++) {
agp_bridge->gart_bus_addr =
- amd64_configure(k8_northbridges[i], gatt_bus);
+ amd64_configure(k8_northbridges.nb_misc[i],
+ gatt_bus);
}
k8_flush_garts();
return 0;
@@ -195,11 +199,15 @@ static void amd64_cleanup(void)
{
u32 tmp;
int i;
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+
+ if (!k8_northbridges.gart_supported)
+ return;
+
+ for (i = 0; i < k8_northbridges.num; i++) {
+ struct pci_dev *dev = k8_northbridges.nb_misc[i];
/* disable gart translation */
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
- tmp &= ~AMD64_GARTEN;
+ tmp &= ~GARTEN;
pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp);
}
}
@@ -313,22 +321,25 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<> 25);
return 0;
}
-static __devinit int cache_nbs (struct pci_dev *pdev, u32 cap_ptr)
+static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr)
{
int i;
if (cache_k8_northbridges() < 0)
return -ENODEV;
+ if (!k8_northbridges.gart_supported)
+ return -ENODEV;
+
i = 0;
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ for (i = 0; i < k8_northbridges.num; i++) {
+ struct pci_dev *dev = k8_northbridges.nb_misc[i];
if (fix_northbridge(dev, pdev, cap_ptr) < 0) {
dev_err(&dev->dev, "no usable aperture found\n");
#ifdef __x86_64__
@@ -405,7 +416,8 @@ static int __devinit uli_agp_init(struct pci_dev *pdev)
}
/* shadow x86-64 registers into ULi registers */
- pci_read_config_dword (k8_northbridges[0], AMD64_GARTAPERTUREBASE, &httfea);
+ pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+ &httfea);
/* if x86-64 aperture base is beyond 4G, exit here */
if ((httfea & 0x7fff) >> (32 - 25)) {
@@ -472,7 +484,8 @@ static int nforce3_agp_init(struct pci_dev *pdev)
pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp);
/* shadow x86-64 registers into NVIDIA registers */
- pci_read_config_dword (k8_northbridges[0], AMD64_GARTAPERTUREBASE, &apbase);
+ pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+ &apbase);
/* if x86-64 aperture base is beyond 4G, exit here */
if ( (apbase & 0x7fff) >> (32 - 25) ) {
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index d2abf5143983..64255cef8a7d 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -984,7 +984,9 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
bridge->driver->cache_flush();
#ifdef CONFIG_X86
- set_memory_uc((unsigned long)table, 1 << page_order);
+ if (set_memory_uc((unsigned long)table, 1 << page_order))
+ printk(KERN_WARNING "Could not set GATT table memory to UC!");
+
bridge->gatt_table = (void *)table;
#else
bridge->gatt_table = ioremap_nocache(virt_to_phys(table),
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c
index 05ad4a17a28f..7c4133582dba 100644
--- a/drivers/char/tpm/tpm.c
+++ b/drivers/char/tpm/tpm.c
@@ -47,6 +47,16 @@ enum tpm_duration {
#define TPM_MAX_PROTECTED_ORDINAL 12
#define TPM_PROTECTED_ORDINAL_MASK 0xFF
+/*
+ * Bug workaround - some TPM's don't flush the most
+ * recently changed pcr on suspend, so force the flush
+ * with an extend to the selected _unused_ non-volatile pcr.
+ */
+static int tpm_suspend_pcr;
+module_param_named(suspend_pcr, tpm_suspend_pcr, uint, 0644);
+MODULE_PARM_DESC(suspend_pcr,
+ "PCR to use for dummy writes to faciltate flush on suspend.");
+
static LIST_HEAD(tpm_chip_list);
static DEFINE_SPINLOCK(driver_lock);
static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES);
@@ -1077,18 +1087,6 @@ static struct tpm_input_header savestate_header = {
.ordinal = TPM_ORD_SAVESTATE
};
-/* Bug workaround - some TPM's don't flush the most
- * recently changed pcr on suspend, so force the flush
- * with an extend to the selected _unused_ non-volatile pcr.
- */
-static int tpm_suspend_pcr;
-static int __init tpm_suspend_setup(char *str)
-{
- get_option(&str, &tpm_suspend_pcr);
- return 1;
-}
-__setup("tpm_suspend_pcr=", tpm_suspend_setup);
-
/*
* We are about to suspend. Save the TPM state
* so that it can be restored.
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 0f69c5ec0ecd..6c1b676643a9 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -48,6 +48,9 @@ struct ports_driver_data {
/* Used for exporting per-port information to debugfs */
struct dentry *debugfs_dir;
+ /* List of all the devices we're handling */
+ struct list_head portdevs;
+
/* Number of devices this driver is handling */
unsigned int index;
@@ -108,6 +111,9 @@ struct port_buffer {
* ports for that device (vdev->priv).
*/
struct ports_device {
+ /* Next portdev in the list, head is in the pdrvdata struct */
+ struct list_head list;
+
/*
* Workqueue handlers where we process deferred work after
* notification
@@ -178,15 +184,21 @@ struct port {
struct console cons;
/* Each port associates with a separate char device */
- struct cdev cdev;
+ struct cdev *cdev;
struct device *dev;
+ /* Reference-counting to handle port hot-unplugs and file operations */
+ struct kref kref;
+
/* A waitqueue for poll() or blocking read operations */
wait_queue_head_t waitqueue;
/* The 'name' of the port that we expose via sysfs properties */
char *name;
+ /* We can notify apps of host connect / disconnect events via SIGIO */
+ struct fasync_struct *async_queue;
+
/* The 'id' to identify the port with the Host */
u32 id;
@@ -221,6 +233,41 @@ out:
return port;
}
+static struct port *find_port_by_devt_in_portdev(struct ports_device *portdev,
+ dev_t dev)
+{
+ struct port *port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&portdev->ports_lock, flags);
+ list_for_each_entry(port, &portdev->ports, list)
+ if (port->cdev->dev == dev)
+ goto out;
+ port = NULL;
+out:
+ spin_unlock_irqrestore(&portdev->ports_lock, flags);
+
+ return port;
+}
+
+static struct port *find_port_by_devt(dev_t dev)
+{
+ struct ports_device *portdev;
+ struct port *port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pdrvdata_lock, flags);
+ list_for_each_entry(portdev, &pdrvdata.portdevs, list) {
+ port = find_port_by_devt_in_portdev(portdev, dev);
+ if (port)
+ goto out;
+ }
+ port = NULL;
+out:
+ spin_unlock_irqrestore(&pdrvdata_lock, flags);
+ return port;
+}
+
static struct port *find_port_by_id(struct ports_device *portdev, u32 id)
{
struct port *port;
@@ -410,7 +457,10 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
static ssize_t send_control_msg(struct port *port, unsigned int event,
unsigned int value)
{
- return __send_control_msg(port->portdev, port->id, event, value);
+ /* Did the port get unplugged before userspace closed it? */
+ if (port->portdev)
+ return __send_control_msg(port->portdev, port->id, event, value);
+ return 0;
}
/* Callers must take the port->outvq_lock */
@@ -525,6 +575,10 @@ static ssize_t fill_readbuf(struct port *port, char *out_buf, size_t out_count,
/* The condition that must be true for polling to end */
static bool will_read_block(struct port *port)
{
+ if (!port->guest_connected) {
+ /* Port got hot-unplugged. Let's exit. */
+ return false;
+ }
return !port_has_data(port) && port->host_connected;
}
@@ -575,6 +629,9 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf,
if (ret < 0)
return ret;
}
+ /* Port got hot-unplugged. */
+ if (!port->guest_connected)
+ return -ENODEV;
/*
* We could've received a disconnection message while we were
* waiting for more data.
@@ -616,6 +673,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
}
+ /* Port got hot-unplugged. */
+ if (!port->guest_connected)
+ return -ENODEV;
count = min((size_t)(32 * 1024), count);
@@ -656,6 +716,10 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
port = filp->private_data;
poll_wait(filp, &port->waitqueue, wait);
+ if (!port->guest_connected) {
+ /* Port got unplugged */
+ return POLLHUP;
+ }
ret = 0;
if (!will_read_block(port))
ret |= POLLIN | POLLRDNORM;
@@ -667,6 +731,8 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
return ret;
}
+static void remove_port(struct kref *kref);
+
static int port_fops_release(struct inode *inode, struct file *filp)
{
struct port *port;
@@ -687,6 +753,16 @@ static int port_fops_release(struct inode *inode, struct file *filp)
reclaim_consumed_buffers(port);
spin_unlock_irq(&port->outvq_lock);
+ /*
+ * Locks aren't necessary here as a port can't be opened after
+ * unplug, and if a port isn't unplugged, a kref would already
+ * exist for the port. Plus, taking ports_lock here would
+ * create a dependency on other locks taken by functions
+ * inside remove_port if we're the last holder of the port,
+ * creating many problems.
+ */
+ kref_put(&port->kref, remove_port);
+
return 0;
}
@@ -694,22 +770,31 @@ static int port_fops_open(struct inode *inode, struct file *filp)
{
struct cdev *cdev = inode->i_cdev;
struct port *port;
+ int ret;
- port = container_of(cdev, struct port, cdev);
+ port = find_port_by_devt(cdev->dev);
filp->private_data = port;
+ /* Prevent against a port getting hot-unplugged at the same time */
+ spin_lock_irq(&port->portdev->ports_lock);
+ kref_get(&port->kref);
+ spin_unlock_irq(&port->portdev->ports_lock);
+
/*
* Don't allow opening of console port devices -- that's done
* via /dev/hvc
*/
- if (is_console_port(port))
- return -ENXIO;
+ if (is_console_port(port)) {
+ ret = -ENXIO;
+ goto out;
+ }
/* Allow only one process to open a particular port at a time */
spin_lock_irq(&port->inbuf_lock);
if (port->guest_connected) {
spin_unlock_irq(&port->inbuf_lock);
- return -EMFILE;
+ ret = -EMFILE;
+ goto out;
}
port->guest_connected = true;
@@ -724,10 +809,23 @@ static int port_fops_open(struct inode *inode, struct file *filp)
reclaim_consumed_buffers(port);
spin_unlock_irq(&port->outvq_lock);
+ nonseekable_open(inode, filp);
+
/* Notify host of port being opened */
send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1);
return 0;
+out:
+ kref_put(&port->kref, remove_port);
+ return ret;
+}
+
+static int port_fops_fasync(int fd, struct file *filp, int mode)
+{
+ struct port *port;
+
+ port = filp->private_data;
+ return fasync_helper(fd, filp, mode, &port->async_queue);
}
/*
@@ -743,6 +841,8 @@ static const struct file_operations port_fops = {
.write = port_fops_write,
.poll = port_fops_poll,
.release = port_fops_release,
+ .fasync = port_fops_fasync,
+ .llseek = no_llseek,
};
/*
@@ -1001,6 +1101,12 @@ static unsigned int fill_queue(struct virtqueue *vq, spinlock_t *lock)
return nr_added_bufs;
}
+static void send_sigio_to_port(struct port *port)
+{
+ if (port->async_queue && port->guest_connected)
+ kill_fasync(&port->async_queue, SIGIO, POLL_OUT);
+}
+
static int add_port(struct ports_device *portdev, u32 id)
{
char debugfs_name[16];
@@ -1015,6 +1121,7 @@ static int add_port(struct ports_device *portdev, u32 id)
err = -ENOMEM;
goto fail;
}
+ kref_init(&port->kref);
port->portdev = portdev;
port->id = id;
@@ -1022,6 +1129,7 @@ static int add_port(struct ports_device *portdev, u32 id)
port->name = NULL;
port->inbuf = NULL;
port->cons.hvc = NULL;
+ port->async_queue = NULL;
port->cons.ws.ws_row = port->cons.ws.ws_col = 0;
@@ -1032,14 +1140,20 @@ static int add_port(struct ports_device *portdev, u32 id)
port->in_vq = portdev->in_vqs[port->id];
port->out_vq = portdev->out_vqs[port->id];
- cdev_init(&port->cdev, &port_fops);
+ port->cdev = cdev_alloc();
+ if (!port->cdev) {
+ dev_err(&port->portdev->vdev->dev, "Error allocating cdev\n");
+ err = -ENOMEM;
+ goto free_port;
+ }
+ port->cdev->ops = &port_fops;
devt = MKDEV(portdev->chr_major, id);
- err = cdev_add(&port->cdev, devt, 1);
+ err = cdev_add(port->cdev, devt, 1);
if (err < 0) {
dev_err(&port->portdev->vdev->dev,
"Error %d adding cdev for port %u\n", err, id);
- goto free_port;
+ goto free_cdev;
}
port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev,
devt, port, "vport%up%u",
@@ -1104,7 +1218,7 @@ free_inbufs:
free_device:
device_destroy(pdrvdata.class, port->dev->devt);
free_cdev:
- cdev_del(&port->cdev);
+ cdev_del(port->cdev);
free_port:
kfree(port);
fail:
@@ -1113,21 +1227,45 @@ fail:
return err;
}
-/* Remove all port-specific data. */
-static int remove_port(struct port *port)
+/* No users remain, remove all port-specific data. */
+static void remove_port(struct kref *kref)
+{
+ struct port *port;
+
+ port = container_of(kref, struct port, kref);
+
+ sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
+ device_destroy(pdrvdata.class, port->dev->devt);
+ cdev_del(port->cdev);
+
+ kfree(port->name);
+
+ debugfs_remove(port->debugfs_file);
+
+ kfree(port);
+}
+
+/*
+ * Port got unplugged. Remove port from portdev's list and drop the
+ * kref reference. If no userspace has this port opened, it will
+ * result in immediate removal the port.
+ */
+static void unplug_port(struct port *port)
{
struct port_buffer *buf;
+ spin_lock_irq(&port->portdev->ports_lock);
+ list_del(&port->list);
+ spin_unlock_irq(&port->portdev->ports_lock);
+
if (port->guest_connected) {
port->guest_connected = false;
port->host_connected = false;
wake_up_interruptible(&port->waitqueue);
- send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 0);
- }
- spin_lock_irq(&port->portdev->ports_lock);
- list_del(&port->list);
- spin_unlock_irq(&port->portdev->ports_lock);
+ /* Let the app know the port is going down. */
+ send_sigio_to_port(port);
+ }
if (is_console_port(port)) {
spin_lock_irq(&pdrvdata_lock);
@@ -1146,9 +1284,6 @@ static int remove_port(struct port *port)
hvc_remove(port->cons.hvc);
#endif
}
- sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
- device_destroy(pdrvdata.class, port->dev->devt);
- cdev_del(&port->cdev);
/* Remove unused data this port might have received. */
discard_port_data(port);
@@ -1159,12 +1294,19 @@ static int remove_port(struct port *port)
while ((buf = virtqueue_detach_unused_buf(port->in_vq)))
free_buf(buf);
- kfree(port->name);
+ /*
+ * We should just assume the device itself has gone off --
+ * else a close on an open port later will try to send out a
+ * control message.
+ */
+ port->portdev = NULL;
- debugfs_remove(port->debugfs_file);
-
- kfree(port);
- return 0;
+ /*
+ * Locks around here are not necessary - a port can't be
+ * opened after we removed the port struct from ports_list
+ * above.
+ */
+ kref_put(&port->kref, remove_port);
}
/* Any private messages that the Host and Guest want to share */
@@ -1203,7 +1345,7 @@ static void handle_control_message(struct ports_device *portdev,
add_port(portdev, cpkt->id);
break;
case VIRTIO_CONSOLE_PORT_REMOVE:
- remove_port(port);
+ unplug_port(port);
break;
case VIRTIO_CONSOLE_CONSOLE_PORT:
if (!cpkt->value)
@@ -1245,6 +1387,12 @@ static void handle_control_message(struct ports_device *portdev,
spin_lock_irq(&port->outvq_lock);
reclaim_consumed_buffers(port);
spin_unlock_irq(&port->outvq_lock);
+
+ /*
+ * If the guest is connected, it'll be interested in
+ * knowing the host connection state changed.
+ */
+ send_sigio_to_port(port);
break;
case VIRTIO_CONSOLE_PORT_NAME:
/*
@@ -1341,6 +1489,9 @@ static void in_intr(struct virtqueue *vq)
wake_up_interruptible(&port->waitqueue);
+ /* Send a SIGIO indicating new data in case the process asked for it */
+ send_sigio_to_port(port);
+
if (is_console_port(port) && hvc_poll(port->cons.hvc))
hvc_kick();
}
@@ -1577,6 +1728,10 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
add_port(portdev, 0);
}
+ spin_lock_irq(&pdrvdata_lock);
+ list_add_tail(&portdev->list, &pdrvdata.portdevs);
+ spin_unlock_irq(&pdrvdata_lock);
+
__send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID,
VIRTIO_CONSOLE_DEVICE_READY, 1);
return 0;
@@ -1600,23 +1755,41 @@ static void virtcons_remove(struct virtio_device *vdev)
{
struct ports_device *portdev;
struct port *port, *port2;
- struct port_buffer *buf;
- unsigned int len;
portdev = vdev->priv;
+ spin_lock_irq(&pdrvdata_lock);
+ list_del(&portdev->list);
+ spin_unlock_irq(&pdrvdata_lock);
+
+ /* Disable interrupts for vqs */
+ vdev->config->reset(vdev);
+ /* Finish up work that's lined up */
cancel_work_sync(&portdev->control_work);
list_for_each_entry_safe(port, port2, &portdev->ports, list)
- remove_port(port);
+ unplug_port(port);
unregister_chrdev(portdev->chr_major, "virtio-portsdev");
- while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
- free_buf(buf);
+ /*
+ * When yanking out a device, we immediately lose the
+ * (device-side) queues. So there's no point in keeping the
+ * guest side around till we drop our final reference. This
+ * also means that any ports which are in an open state will
+ * have to just stop using the port, as the vqs are going
+ * away.
+ */
+ if (use_multiport(portdev)) {
+ struct port_buffer *buf;
+ unsigned int len;
- while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
- free_buf(buf);
+ while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
+ free_buf(buf);
+
+ while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
+ free_buf(buf);
+ }
vdev->config->del_vqs(vdev);
kfree(portdev->in_vqs);
@@ -1663,6 +1836,7 @@ static int __init init(void)
PTR_ERR(pdrvdata.debugfs_dir));
}
INIT_LIST_HEAD(&pdrvdata.consoles);
+ INIT_LIST_HEAD(&pdrvdata.portdevs);
return register_virtio_driver(&virtio_console);
}
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 70bb350de996..9dbb28b9559f 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -39,7 +39,7 @@ config EDAC_DEBUG
there're four debug levels (x=0,1,2,3 from low to high).
Usually you should select 'N'.
- config EDAC_DECODE_MCE
+config EDAC_DECODE_MCE
tristate "Decode MCEs in human-readable form (only on AMD for now)"
depends on CPU_SUP_AMD && X86_MCE
default y
@@ -51,6 +51,16 @@ config EDAC_DEBUG
which occur really early upon boot, before the module infrastructure
has been initialized.
+config EDAC_MCE_INJ
+ tristate "Simple MCE injection interface over /sysfs"
+ depends on EDAC_DECODE_MCE
+ default n
+ help
+ This is a simple interface to inject MCEs over /sysfs and test
+ the MCE decoding code in EDAC.
+
+ This is currently AMD-only.
+
config EDAC_MM_EDAC
tristate "Main Memory EDAC (Error Detection And Correction) reporting"
help
@@ -66,13 +76,13 @@ config EDAC_MCE
config EDAC_AMD64
tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h"
- depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI && EDAC_DECODE_MCE
+ depends on EDAC_MM_EDAC && AMD_NB && X86_64 && PCI && EDAC_DECODE_MCE
help
Support for error detection and correction on the AMD 64
Families of Memory Controllers (K8, F10h and F11h)
config EDAC_AMD64_ERROR_INJECTION
- bool "Sysfs Error Injection facilities"
+ bool "Sysfs HW Error injection facilities"
depends on EDAC_AMD64
help
Recent Opterons (Family 10h and later) provide for Memory Error
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index ca6b1bb24ccc..32c7bc93c525 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -17,6 +17,9 @@ ifdef CONFIG_PCI
edac_core-objs += edac_pci.o edac_pci_sysfs.o
endif
+obj-$(CONFIG_EDAC_MCE_INJ) += mce_amd_inj.o
+
+edac_mce_amd-objs := mce_amd.o
obj-$(CONFIG_EDAC_DECODE_MCE) += edac_mce_amd.o
obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e7d5d6b5dcf6..8521401bbd75 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1,5 +1,5 @@
#include "amd64_edac.h"
-#include
+#include
static struct edac_pci_ctl_info *amd64_ctl_pci;
@@ -2073,11 +2073,18 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
amd64_handle_ue(mci, info);
}
-void amd64_decode_bus_error(int node_id, struct err_regs *regs)
+void amd64_decode_bus_error(int node_id, struct mce *m, u32 nbcfg)
{
struct mem_ctl_info *mci = mci_lookup[node_id];
+ struct err_regs regs;
- __amd64_decode_bus_error(mci, regs);
+ regs.nbsl = (u32) m->status;
+ regs.nbsh = (u32)(m->status >> 32);
+ regs.nbeal = (u32) m->addr;
+ regs.nbeah = (u32)(m->addr >> 32);
+ regs.nbcfg = nbcfg;
+
+ __amd64_decode_bus_error(mci, ®s);
/*
* Check the UE bit of the NB status high register, if set generate some
@@ -2086,7 +2093,7 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)
*
* FIXME: this should go somewhere else, if at all.
*/
- if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
+ if (regs.nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
edac_mc_handle_ue_no_info(mci, "UE bit is set");
}
@@ -2927,7 +2934,7 @@ static int __init amd64_edac_init(void)
* to finish initialization of the MC instances.
*/
err = -ENODEV;
- for (nb = 0; nb < num_k8_northbridges; nb++) {
+ for (nb = 0; nb < k8_northbridges.num; nb++) {
if (!pvt_lookup[nb])
continue;
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 613b9381e71a..044aee4f944d 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -72,7 +72,7 @@
#include
#include
#include "edac_core.h"
-#include "edac_mce_amd.h"
+#include "mce_amd.h"
#define amd64_printk(level, fmt, arg...) \
edac_printk(level, "amd64", fmt, ##arg)
@@ -482,11 +482,10 @@ extern const char *rrrr_msgs[16];
extern const char *to_msgs[2];
extern const char *pp_msgs[4];
extern const char *ii_msgs[4];
-extern const char *ext_msgs[32];
extern const char *htlink_msgs[8];
#ifdef CONFIG_EDAC_DEBUG
-#define NUM_DBG_ATTRS 9
+#define NUM_DBG_ATTRS 5
#else
#define NUM_DBG_ATTRS 0
#endif
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index 59cf2cf6e11e..e3562288f4ce 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -1,167 +1,16 @@
#include "amd64_edac.h"
-/*
- * accept a hex value and store it into the virtual error register file, field:
- * nbeal and nbeah. Assume virtual error values have already been set for: NBSL,
- * NBSH and NBCFG. Then proceed to map the error values to a MC, CSROW and
- * CHANNEL
- */
-static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
- size_t count)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
- unsigned long long value;
- int ret = 0;
-
- ret = strict_strtoull(data, 16, &value);
- if (ret != -EINVAL) {
- debugf0("received NBEA= 0x%llx\n", value);
-
- /* place the value into the virtual error packet */
- pvt->ctl_error_info.nbeal = (u32) value;
- value >>= 32;
- pvt->ctl_error_info.nbeah = (u32) value;
-
- /* Process the Mapping request */
- /* TODO: Add race prevention */
- amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
-
- return count;
- }
- return ret;
+#define EDAC_DCT_ATTR_SHOW(reg) \
+static ssize_t amd64_##reg##_show(struct mem_ctl_info *mci, char *data) \
+{ \
+ struct amd64_pvt *pvt = mci->pvt_info; \
+ return sprintf(data, "0x%016llx\n", (u64)pvt->reg); \
}
-/* display back what the last NBEA (MCA NB Address (MC4_ADDR)) was written */
-static ssize_t amd64_nbea_show(struct mem_ctl_info *mci, char *data)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
- u64 value;
-
- value = pvt->ctl_error_info.nbeah;
- value <<= 32;
- value |= pvt->ctl_error_info.nbeal;
-
- return sprintf(data, "%llx\n", value);
-}
-
-/* store the NBSL (MCA NB Status Low (MC4_STATUS)) value user desires */
-static ssize_t amd64_nbsl_store(struct mem_ctl_info *mci, const char *data,
- size_t count)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
- unsigned long value;
- int ret = 0;
-
- ret = strict_strtoul(data, 16, &value);
- if (ret != -EINVAL) {
- debugf0("received NBSL= 0x%lx\n", value);
-
- pvt->ctl_error_info.nbsl = (u32) value;
-
- return count;
- }
- return ret;
-}
-
-/* display back what the last NBSL value written */
-static ssize_t amd64_nbsl_show(struct mem_ctl_info *mci, char *data)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
- u32 value;
-
- value = pvt->ctl_error_info.nbsl;
-
- return sprintf(data, "%x\n", value);
-}
-
-/* store the NBSH (MCA NB Status High) value user desires */
-static ssize_t amd64_nbsh_store(struct mem_ctl_info *mci, const char *data,
- size_t count)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
- unsigned long value;
- int ret = 0;
-
- ret = strict_strtoul(data, 16, &value);
- if (ret != -EINVAL) {
- debugf0("received NBSH= 0x%lx\n", value);
-
- pvt->ctl_error_info.nbsh = (u32) value;
-
- return count;
- }
- return ret;
-}
-
-/* display back what the last NBSH value written */
-static ssize_t amd64_nbsh_show(struct mem_ctl_info *mci, char *data)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
- u32 value;
-
- value = pvt->ctl_error_info.nbsh;
-
- return sprintf(data, "%x\n", value);
-}
-
-/* accept and store the NBCFG (MCA NB Configuration) value user desires */
-static ssize_t amd64_nbcfg_store(struct mem_ctl_info *mci,
- const char *data, size_t count)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
- unsigned long value;
- int ret = 0;
-
- ret = strict_strtoul(data, 16, &value);
- if (ret != -EINVAL) {
- debugf0("received NBCFG= 0x%lx\n", value);
-
- pvt->ctl_error_info.nbcfg = (u32) value;
-
- return count;
- }
- return ret;
-}
-
-/* various show routines for the controls of a MCI */
-static ssize_t amd64_nbcfg_show(struct mem_ctl_info *mci, char *data)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
-
- return sprintf(data, "%x\n", pvt->ctl_error_info.nbcfg);
-}
-
-
-static ssize_t amd64_dhar_show(struct mem_ctl_info *mci, char *data)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
-
- return sprintf(data, "%x\n", pvt->dhar);
-}
-
-
-static ssize_t amd64_dbam_show(struct mem_ctl_info *mci, char *data)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
-
- return sprintf(data, "%x\n", pvt->dbam0);
-}
-
-
-static ssize_t amd64_topmem_show(struct mem_ctl_info *mci, char *data)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
-
- return sprintf(data, "%llx\n", pvt->top_mem);
-}
-
-
-static ssize_t amd64_topmem2_show(struct mem_ctl_info *mci, char *data)
-{
- struct amd64_pvt *pvt = mci->pvt_info;
-
- return sprintf(data, "%llx\n", pvt->top_mem2);
-}
+EDAC_DCT_ATTR_SHOW(dhar);
+EDAC_DCT_ATTR_SHOW(dbam0);
+EDAC_DCT_ATTR_SHOW(top_mem);
+EDAC_DCT_ATTR_SHOW(top_mem2);
static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data)
{
@@ -180,38 +29,6 @@ static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data)
*/
struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
- {
- .attr = {
- .name = "nbea_ctl",
- .mode = (S_IRUGO | S_IWUSR)
- },
- .show = amd64_nbea_show,
- .store = amd64_nbea_store,
- },
- {
- .attr = {
- .name = "nbsl_ctl",
- .mode = (S_IRUGO | S_IWUSR)
- },
- .show = amd64_nbsl_show,
- .store = amd64_nbsl_store,
- },
- {
- .attr = {
- .name = "nbsh_ctl",
- .mode = (S_IRUGO | S_IWUSR)
- },
- .show = amd64_nbsh_show,
- .store = amd64_nbsh_store,
- },
- {
- .attr = {
- .name = "nbcfg_ctl",
- .mode = (S_IRUGO | S_IWUSR)
- },
- .show = amd64_nbcfg_show,
- .store = amd64_nbcfg_store,
- },
{
.attr = {
.name = "dhar",
@@ -225,7 +42,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
.name = "dbam",
.mode = (S_IRUGO)
},
- .show = amd64_dbam_show,
+ .show = amd64_dbam0_show,
.store = NULL,
},
{
@@ -233,7 +50,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
.name = "topmem",
.mode = (S_IRUGO)
},
- .show = amd64_topmem_show,
+ .show = amd64_top_mem_show,
.store = NULL,
},
{
@@ -241,7 +58,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
.name = "topmem2",
.mode = (S_IRUGO)
},
- .show = amd64_topmem2_show,
+ .show = amd64_top_mem2_show,
.store = NULL,
},
{
diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c
index 070968178a24..2941dca91aae 100644
--- a/drivers/edac/edac_device_sysfs.c
+++ b/drivers/edac/edac_device_sysfs.c
@@ -13,6 +13,7 @@
#include
#include
#include
+#include
#include "edac_core.h"
#include "edac_module.h"
@@ -235,7 +236,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
debugf1("%s()\n", __func__);
/* get the /sys/devices/system/edac reference */
- edac_class = edac_get_edac_class();
+ edac_class = edac_get_sysfs_class();
if (edac_class == NULL) {
debugf1("%s() no edac_class error\n", __func__);
err = -ENODEV;
@@ -255,7 +256,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
if (!try_module_get(edac_dev->owner)) {
err = -ENODEV;
- goto err_out;
+ goto err_mod_get;
}
/* register */
@@ -282,6 +283,9 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
err_kobj_reg:
module_put(edac_dev->owner);
+err_mod_get:
+ edac_put_sysfs_class();
+
err_out:
return err;
}
@@ -290,12 +294,11 @@ err_out:
* edac_device_unregister_sysfs_main_kobj:
* the '..../edac/' kobject
*/
-void edac_device_unregister_sysfs_main_kobj(
- struct edac_device_ctl_info *edac_dev)
+void edac_device_unregister_sysfs_main_kobj(struct edac_device_ctl_info *dev)
{
debugf0("%s()\n", __func__);
debugf4("%s() name of kobject is: %s\n",
- __func__, kobject_name(&edac_dev->kobj));
+ __func__, kobject_name(&dev->kobj));
/*
* Unregister the edac device's kobject and
@@ -304,7 +307,8 @@ void edac_device_unregister_sysfs_main_kobj(
* a) module_put() this module
* b) 'kfree' the memory
*/
- kobject_put(&edac_dev->kobj);
+ kobject_put(&dev->kobj);
+ edac_put_sysfs_class();
}
/* edac_dev -> instance information */
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 8aad94d10c0c..a4135860149b 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -11,6 +11,7 @@
#include
#include
+#include
#include
#include "edac_core.h"
@@ -1011,13 +1012,13 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
*/
int edac_sysfs_setup_mc_kset(void)
{
- int err = 0;
+ int err = -EINVAL;
struct sysdev_class *edac_class;
debugf1("%s()\n", __func__);
/* get the /sys/devices/system/edac class reference */
- edac_class = edac_get_edac_class();
+ edac_class = edac_get_sysfs_class();
if (edac_class == NULL) {
debugf1("%s() no edac_class error=%d\n", __func__, err);
goto fail_out;
@@ -1028,15 +1029,16 @@ int edac_sysfs_setup_mc_kset(void)
if (!mc_kset) {
err = -ENOMEM;
debugf1("%s() Failed to register '.../edac/mc'\n", __func__);
- goto fail_out;
+ goto fail_kset;
}
debugf1("%s() Registered '.../edac/mc' kobject\n", __func__);
return 0;
+fail_kset:
+ edac_put_sysfs_class();
- /* error unwind stack */
fail_out:
return err;
}
@@ -1049,5 +1051,6 @@ fail_out:
void edac_sysfs_teardown_mc_kset(void)
{
kset_unregister(mc_kset);
+ edac_put_sysfs_class();
}
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
deleted file mode 100644
index 9014df6f605d..000000000000
--- a/drivers/edac/edac_mce_amd.c
+++ /dev/null
@@ -1,452 +0,0 @@
-#include
-#include "edac_mce_amd.h"
-
-static bool report_gart_errors;
-static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
-
-void amd_report_gart_errors(bool v)
-{
- report_gart_errors = v;
-}
-EXPORT_SYMBOL_GPL(amd_report_gart_errors);
-
-void amd_register_ecc_decoder(void (*f)(int, struct err_regs *))
-{
- nb_bus_decoder = f;
-}
-EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
-
-void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *))
-{
- if (nb_bus_decoder) {
- WARN_ON(nb_bus_decoder != f);
-
- nb_bus_decoder = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
-
-/*
- * string representation for the different MCA reported error types, see F3x48
- * or MSR0000_0411.
- */
-const char *tt_msgs[] = { /* transaction type */
- "instruction",
- "data",
- "generic",
- "reserved"
-};
-EXPORT_SYMBOL_GPL(tt_msgs);
-
-const char *ll_msgs[] = { /* cache level */
- "L0",
- "L1",
- "L2",
- "L3/generic"
-};
-EXPORT_SYMBOL_GPL(ll_msgs);
-
-const char *rrrr_msgs[] = {
- "generic",
- "generic read",
- "generic write",
- "data read",
- "data write",
- "inst fetch",
- "prefetch",
- "evict",
- "snoop",
- "reserved RRRR= 9",
- "reserved RRRR= 10",
- "reserved RRRR= 11",
- "reserved RRRR= 12",
- "reserved RRRR= 13",
- "reserved RRRR= 14",
- "reserved RRRR= 15"
-};
-EXPORT_SYMBOL_GPL(rrrr_msgs);
-
-const char *pp_msgs[] = { /* participating processor */
- "local node originated (SRC)",
- "local node responded to request (RES)",
- "local node observed as 3rd party (OBS)",
- "generic"
-};
-EXPORT_SYMBOL_GPL(pp_msgs);
-
-const char *to_msgs[] = {
- "no timeout",
- "timed out"
-};
-EXPORT_SYMBOL_GPL(to_msgs);
-
-const char *ii_msgs[] = { /* memory or i/o */
- "mem access",
- "reserved",
- "i/o access",
- "generic"
-};
-EXPORT_SYMBOL_GPL(ii_msgs);
-
-/*
- * Map the 4 or 5 (family-specific) bits of Extended Error code to the
- * string table.
- */
-const char *ext_msgs[] = {
- "K8 ECC error", /* 0_0000b */
- "CRC error on link", /* 0_0001b */
- "Sync error packets on link", /* 0_0010b */
- "Master Abort during link operation", /* 0_0011b */
- "Target Abort during link operation", /* 0_0100b */
- "Invalid GART PTE entry during table walk", /* 0_0101b */
- "Unsupported atomic RMW command received", /* 0_0110b */
- "WDT error: NB transaction timeout", /* 0_0111b */
- "ECC/ChipKill ECC error", /* 0_1000b */
- "SVM DEV Error", /* 0_1001b */
- "Link Data error", /* 0_1010b */
- "Link/L3/Probe Filter Protocol error", /* 0_1011b */
- "NB Internal Arrays Parity error", /* 0_1100b */
- "DRAM Address/Control Parity error", /* 0_1101b */
- "Link Transmission error", /* 0_1110b */
- "GART/DEV Table Walk Data error" /* 0_1111b */
- "Res 0x100 error", /* 1_0000b */
- "Res 0x101 error", /* 1_0001b */
- "Res 0x102 error", /* 1_0010b */
- "Res 0x103 error", /* 1_0011b */
- "Res 0x104 error", /* 1_0100b */
- "Res 0x105 error", /* 1_0101b */
- "Res 0x106 error", /* 1_0110b */
- "Res 0x107 error", /* 1_0111b */
- "Res 0x108 error", /* 1_1000b */
- "Res 0x109 error", /* 1_1001b */
- "Res 0x10A error", /* 1_1010b */
- "Res 0x10B error", /* 1_1011b */
- "ECC error in L3 Cache Data", /* 1_1100b */
- "L3 Cache Tag error", /* 1_1101b */
- "L3 Cache LRU Parity error", /* 1_1110b */
- "Probe Filter error" /* 1_1111b */
-};
-EXPORT_SYMBOL_GPL(ext_msgs);
-
-static void amd_decode_dc_mce(u64 mc0_status)
-{
- u32 ec = mc0_status & 0xffff;
- u32 xec = (mc0_status >> 16) & 0xf;
-
- pr_emerg("Data Cache Error");
-
- if (xec == 1 && TLB_ERROR(ec))
- pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
- else if (xec == 0) {
- if (mc0_status & (1ULL << 40))
- pr_cont(" during Data Scrub.\n");
- else if (TLB_ERROR(ec))
- pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
- else if (MEM_ERROR(ec)) {
- u8 ll = ec & 0x3;
- u8 tt = (ec >> 2) & 0x3;
- u8 rrrr = (ec >> 4) & 0xf;
-
- /* see F10h BKDG (31116), Table 92. */
- if (ll == 0x1) {
- if (tt != 0x1)
- goto wrong_dc_mce;
-
- pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec));
-
- } else if (ll == 0x2 && rrrr == 0x3)
- pr_cont(" during L1 linefill from L2.\n");
- else
- goto wrong_dc_mce;
- } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
- pr_cont(" during system linefill.\n");
- else
- goto wrong_dc_mce;
- } else
- goto wrong_dc_mce;
-
- return;
-
-wrong_dc_mce:
- pr_warning("Corrupted DC MCE info?\n");
-}
-
-static void amd_decode_ic_mce(u64 mc1_status)
-{
- u32 ec = mc1_status & 0xffff;
- u32 xec = (mc1_status >> 16) & 0xf;
-
- pr_emerg("Instruction Cache Error");
-
- if (xec == 1 && TLB_ERROR(ec))
- pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
- else if (xec == 0) {
- if (TLB_ERROR(ec))
- pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
- else if (BUS_ERROR(ec)) {
- if (boot_cpu_data.x86 == 0xf &&
- (mc1_status & (1ULL << 58)))
- pr_cont(" during system linefill.\n");
- else
- pr_cont(" during attempted NB data read.\n");
- } else if (MEM_ERROR(ec)) {
- u8 ll = ec & 0x3;
- u8 rrrr = (ec >> 4) & 0xf;
-
- if (ll == 0x2)
- pr_cont(" during a linefill from L2.\n");
- else if (ll == 0x1) {
-
- switch (rrrr) {
- case 0x5:
- pr_cont(": Parity error during "
- "data load.\n");
- break;
-
- case 0x7:
- pr_cont(": Copyback Parity/Victim"
- " error.\n");
- break;
-
- case 0x8:
- pr_cont(": Tag Snoop error.\n");
- break;
-
- default:
- goto wrong_ic_mce;
- break;
- }
- }
- } else
- goto wrong_ic_mce;
- } else
- goto wrong_ic_mce;
-
- return;
-
-wrong_ic_mce:
- pr_warning("Corrupted IC MCE info?\n");
-}
-
-static void amd_decode_bu_mce(u64 mc2_status)
-{
- u32 ec = mc2_status & 0xffff;
- u32 xec = (mc2_status >> 16) & 0xf;
-
- pr_emerg("Bus Unit Error");
-
- if (xec == 0x1)
- pr_cont(" in the write data buffers.\n");
- else if (xec == 0x3)
- pr_cont(" in the victim data buffers.\n");
- else if (xec == 0x2 && MEM_ERROR(ec))
- pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
- else if (xec == 0x0) {
- if (TLB_ERROR(ec))
- pr_cont(": %s error in a Page Descriptor Cache or "
- "Guest TLB.\n", TT_MSG(ec));
- else if (BUS_ERROR(ec))
- pr_cont(": %s/ECC error in data read from NB: %s.\n",
- RRRR_MSG(ec), PP_MSG(ec));
- else if (MEM_ERROR(ec)) {
- u8 rrrr = (ec >> 4) & 0xf;
-
- if (rrrr >= 0x7)
- pr_cont(": %s error during data copyback.\n",
- RRRR_MSG(ec));
- else if (rrrr <= 0x1)
- pr_cont(": %s parity/ECC error during data "
- "access from L2.\n", RRRR_MSG(ec));
- else
- goto wrong_bu_mce;
- } else
- goto wrong_bu_mce;
- } else
- goto wrong_bu_mce;
-
- return;
-
-wrong_bu_mce:
- pr_warning("Corrupted BU MCE info?\n");
-}
-
-static void amd_decode_ls_mce(u64 mc3_status)
-{
- u32 ec = mc3_status & 0xffff;
- u32 xec = (mc3_status >> 16) & 0xf;
-
- pr_emerg("Load Store Error");
-
- if (xec == 0x0) {
- u8 rrrr = (ec >> 4) & 0xf;
-
- if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
- goto wrong_ls_mce;
-
- pr_cont(" during %s.\n", RRRR_MSG(ec));
- }
- return;
-
-wrong_ls_mce:
- pr_warning("Corrupted LS MCE info?\n");
-}
-
-void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
-{
- u32 ec = ERROR_CODE(regs->nbsl);
-
- if (!handle_errors)
- return;
-
- /*
- * GART TLB error reporting is disabled by default. Bail out early.
- */
- if (TLB_ERROR(ec) && !report_gart_errors)
- return;
-
- pr_emerg("Northbridge Error, node %d", node_id);
-
- /*
- * F10h, revD can disable ErrCpu[3:0] so check that first and also the
- * value encoding has changed so interpret those differently
- */
- if ((boot_cpu_data.x86 == 0x10) &&
- (boot_cpu_data.x86_model > 7)) {
- if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
- pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
- } else {
- u8 assoc_cpus = regs->nbsh & 0xf;
-
- if (assoc_cpus > 0)
- pr_cont(", core: %d", fls(assoc_cpus) - 1);
-
- pr_cont("\n");
- }
-
- pr_emerg("%s.\n", EXT_ERR_MSG(regs->nbsl));
-
- if (BUS_ERROR(ec) && nb_bus_decoder)
- nb_bus_decoder(node_id, regs);
-}
-EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
-
-static void amd_decode_fr_mce(u64 mc5_status)
-{
- /* we have only one error signature so match all fields at once. */
- if ((mc5_status & 0xffff) == 0x0f0f)
- pr_emerg(" FR Error: CPU Watchdog timer expire.\n");
- else
- pr_warning("Corrupted FR MCE info?\n");
-}
-
-static inline void amd_decode_err_code(unsigned int ec)
-{
- if (TLB_ERROR(ec)) {
- pr_emerg("Transaction: %s, Cache Level %s\n",
- TT_MSG(ec), LL_MSG(ec));
- } else if (MEM_ERROR(ec)) {
- pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
- RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
- } else if (BUS_ERROR(ec)) {
- pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
- "Participating Processor: %s\n",
- RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
- PP_MSG(ec));
- } else
- pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
-}
-
-static int amd_decode_mce(struct notifier_block *nb, unsigned long val,
- void *data)
-{
- struct mce *m = (struct mce *)data;
- struct err_regs regs;
- int node, ecc;
-
- pr_emerg("MC%d_STATUS: ", m->bank);
-
- pr_cont("%sorrected error, other errors lost: %s, "
- "CPU context corrupt: %s",
- ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
- ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
- ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
-
- /* do the two bits[14:13] together */
- ecc = (m->status >> 45) & 0x3;
- if (ecc)
- pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
-
- pr_cont("\n");
-
- switch (m->bank) {
- case 0:
- amd_decode_dc_mce(m->status);
- break;
-
- case 1:
- amd_decode_ic_mce(m->status);
- break;
-
- case 2:
- amd_decode_bu_mce(m->status);
- break;
-
- case 3:
- amd_decode_ls_mce(m->status);
- break;
-
- case 4:
- regs.nbsl = (u32) m->status;
- regs.nbsh = (u32)(m->status >> 32);
- regs.nbeal = (u32) m->addr;
- regs.nbeah = (u32)(m->addr >> 32);
- node = amd_get_nb_id(m->extcpu);
-
- amd_decode_nb_mce(node, ®s, 1);
- break;
-
- case 5:
- amd_decode_fr_mce(m->status);
- break;
-
- default:
- break;
- }
-
- amd_decode_err_code(m->status & 0xffff);
-
- return NOTIFY_STOP;
-}
-
-static struct notifier_block amd_mce_dec_nb = {
- .notifier_call = amd_decode_mce,
-};
-
-static int __init mce_amd_init(void)
-{
- /*
- * We can decode MCEs for K8, F10h and F11h CPUs:
- */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
- return 0;
-
- if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
- return 0;
-
- atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
-
- return 0;
-}
-early_initcall(mce_amd_init);
-
-#ifdef MODULE
-static void __exit mce_amd_exit(void)
-{
- atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
-}
-
-MODULE_DESCRIPTION("AMD MCE decoder");
-MODULE_ALIAS("edac-mce-amd");
-MODULE_LICENSE("GPL");
-module_exit(mce_amd_exit);
-#endif
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c
index 7e1374afd967..be4b075c3098 100644
--- a/drivers/edac/edac_module.c
+++ b/drivers/edac/edac_module.c
@@ -26,15 +26,6 @@ EXPORT_SYMBOL_GPL(edac_debug_level);
/* scope is to module level only */
struct workqueue_struct *edac_workqueue;
-/*
- * sysfs object: /sys/devices/system/edac
- * need to export to other files in this modules
- */
-static struct sysdev_class edac_class = {
- .name = "edac",
-};
-static int edac_class_valid;
-
/*
* edac_op_state_to_string()
*/
@@ -54,60 +45,6 @@ char *edac_op_state_to_string(int opstate)
return "UNKNOWN";
}
-/*
- * edac_get_edac_class()
- *
- * return pointer to the edac class of 'edac'
- */
-struct sysdev_class *edac_get_edac_class(void)
-{
- struct sysdev_class *classptr = NULL;
-
- if (edac_class_valid)
- classptr = &edac_class;
-
- return classptr;
-}
-
-/*
- * edac_register_sysfs_edac_name()
- *
- * register the 'edac' into /sys/devices/system
- *
- * return:
- * 0 success
- * !0 error
- */
-static int edac_register_sysfs_edac_name(void)
-{
- int err;
-
- /* create the /sys/devices/system/edac directory */
- err = sysdev_class_register(&edac_class);
-
- if (err) {
- debugf1("%s() error=%d\n", __func__, err);
- return err;
- }
-
- edac_class_valid = 1;
- return 0;
-}
-
-/*
- * sysdev_class_unregister()
- *
- * unregister the 'edac' from /sys/devices/system
- */
-static void edac_unregister_sysfs_edac_name(void)
-{
- /* only if currently registered, then unregister it */
- if (edac_class_valid)
- sysdev_class_unregister(&edac_class);
-
- edac_class_valid = 0;
-}
-
/*
* edac_workqueue_setup
* initialize the edac work queue for polling operations
@@ -153,22 +90,12 @@ static int __init edac_init(void)
*/
edac_pci_clear_parity_errors();
- /*
- * perform the registration of the /sys/devices/system/edac class object
- */
- if (edac_register_sysfs_edac_name()) {
- edac_printk(KERN_ERR, EDAC_MC,
- "Error initializing 'edac' kobject\n");
- err = -ENODEV;
- goto error;
- }
-
/*
* now set up the mc_kset under the edac class object
*/
err = edac_sysfs_setup_mc_kset();
if (err)
- goto sysfs_setup_fail;
+ goto error;
/* Setup/Initialize the workq for this core */
err = edac_workqueue_setup();
@@ -183,9 +110,6 @@ static int __init edac_init(void)
workq_fail:
edac_sysfs_teardown_mc_kset();
-sysfs_setup_fail:
- edac_unregister_sysfs_edac_name();
-
error:
return err;
}
@@ -201,7 +125,6 @@ static void __exit edac_exit(void)
/* tear down the various subsystems */
edac_workqueue_teardown();
edac_sysfs_teardown_mc_kset();
- edac_unregister_sysfs_edac_name();
}
/*
diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h
index 233d4798c3aa..17aabb7b90ec 100644
--- a/drivers/edac/edac_module.h
+++ b/drivers/edac/edac_module.h
@@ -42,7 +42,6 @@ extern void edac_device_unregister_sysfs_main_kobj(
struct edac_device_ctl_info *edac_dev);
extern int edac_device_create_sysfs(struct edac_device_ctl_info *edac_dev);
extern void edac_device_remove_sysfs(struct edac_device_ctl_info *edac_dev);
-extern struct sysdev_class *edac_get_edac_class(void);
/* edac core workqueue: single CPU mode */
extern struct workqueue_struct *edac_workqueue;
diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c
index c39697df9cb4..023b01cb5175 100644
--- a/drivers/edac/edac_pci_sysfs.c
+++ b/drivers/edac/edac_pci_sysfs.c
@@ -7,7 +7,7 @@
*
*/
#include
-#include
+#include
#include
#include
@@ -354,7 +354,7 @@ static int edac_pci_main_kobj_setup(void)
/* First time, so create the main kobject and its
* controls and atributes
*/
- edac_class = edac_get_edac_class();
+ edac_class = edac_get_sysfs_class();
if (edac_class == NULL) {
debugf1("%s() no edac_class\n", __func__);
err = -ENODEV;
@@ -368,7 +368,7 @@ static int edac_pci_main_kobj_setup(void)
if (!try_module_get(THIS_MODULE)) {
debugf1("%s() try_module_get() failed\n", __func__);
err = -ENODEV;
- goto decrement_count_fail;
+ goto mod_get_fail;
}
edac_pci_top_main_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
@@ -403,6 +403,9 @@ kobject_init_and_add_fail:
kzalloc_fail:
module_put(THIS_MODULE);
+mod_get_fail:
+ edac_put_sysfs_class();
+
decrement_count_fail:
/* if are on this error exit, nothing to tear down */
atomic_dec(&edac_pci_sysfs_refcount);
@@ -429,6 +432,7 @@ static void edac_pci_main_kobj_teardown(void)
__func__);
kobject_put(edac_pci_top_main_kobj);
}
+ edac_put_sysfs_class();
}
/*
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
index 20b428aa155e..aab970760b75 100644
--- a/drivers/edac/edac_stub.c
+++ b/drivers/edac/edac_stub.c
@@ -3,10 +3,13 @@
*
* Author: Dave Jiang
*
- * 2007 (c) MontaVista Software, Inc. This file is licensed under
- * the terms of the GNU General Public License version 2. This program
- * is licensed "as is" without any warranty of any kind, whether express
- * or implied.
+ * 2007 (c) MontaVista Software, Inc.
+ * 2010 (c) Advanced Micro Devices Inc.
+ * Borislav Petkov
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
*
*/
#include
@@ -23,6 +26,8 @@ EXPORT_SYMBOL_GPL(edac_handlers);
int edac_err_assert = 0;
EXPORT_SYMBOL_GPL(edac_err_assert);
+static atomic_t edac_class_valid = ATOMIC_INIT(0);
+
/*
* called to determine if there is an EDAC driver interested in
* knowing an event (such as NMI) occurred
@@ -44,3 +49,41 @@ void edac_atomic_assert_error(void)
edac_err_assert++;
}
EXPORT_SYMBOL_GPL(edac_atomic_assert_error);
+
+/*
+ * sysfs object: /sys/devices/system/edac
+ * need to export to other files
+ */
+struct sysdev_class edac_class = {
+ .name = "edac",
+};
+EXPORT_SYMBOL_GPL(edac_class);
+
+/* return pointer to the 'edac' node in sysfs */
+struct sysdev_class *edac_get_sysfs_class(void)
+{
+ int err = 0;
+
+ if (atomic_read(&edac_class_valid))
+ goto out;
+
+ /* create the /sys/devices/system/edac directory */
+ err = sysdev_class_register(&edac_class);
+ if (err) {
+ printk(KERN_ERR "Error registering toplevel EDAC sysfs dir\n");
+ return NULL;
+ }
+
+out:
+ atomic_inc(&edac_class_valid);
+ return &edac_class;
+}
+EXPORT_SYMBOL_GPL(edac_get_sysfs_class);
+
+void edac_put_sysfs_class(void)
+{
+ /* last user unregisters it */
+ if (atomic_dec_and_test(&edac_class_valid))
+ sysdev_class_unregister(&edac_class);
+}
+EXPORT_SYMBOL_GPL(edac_put_sysfs_class);
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
new file mode 100644
index 000000000000..c0181093b490
--- /dev/null
+++ b/drivers/edac/mce_amd.c
@@ -0,0 +1,680 @@
+#include
+#include
+
+#include "mce_amd.h"
+
+static struct amd_decoder_ops *fam_ops;
+
+static u8 nb_err_cpumask = 0xf;
+
+static bool report_gart_errors;
+static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
+
+void amd_report_gart_errors(bool v)
+{
+ report_gart_errors = v;
+}
+EXPORT_SYMBOL_GPL(amd_report_gart_errors);
+
+void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
+{
+ nb_bus_decoder = f;
+}
+EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
+
+void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
+{
+ if (nb_bus_decoder) {
+ WARN_ON(nb_bus_decoder != f);
+
+ nb_bus_decoder = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
+
+/*
+ * string representation for the different MCA reported error types, see F3x48
+ * or MSR0000_0411.
+ */
+
+/* transaction type */
+const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
+EXPORT_SYMBOL_GPL(tt_msgs);
+
+/* cache level */
+const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
+EXPORT_SYMBOL_GPL(ll_msgs);
+
+/* memory transaction type */
+const char *rrrr_msgs[] = {
+ "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
+};
+EXPORT_SYMBOL_GPL(rrrr_msgs);
+
+/* participating processor */
+const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
+EXPORT_SYMBOL_GPL(pp_msgs);
+
+/* request timeout */
+const char *to_msgs[] = { "no timeout", "timed out" };
+EXPORT_SYMBOL_GPL(to_msgs);
+
+/* memory or i/o */
+const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
+EXPORT_SYMBOL_GPL(ii_msgs);
+
+static const char *f10h_nb_mce_desc[] = {
+ "HT link data error",
+ "Protocol error (link, L3, probe filter, etc.)",
+ "Parity error in NB-internal arrays",
+ "Link Retry due to IO link transmission error",
+ "L3 ECC data cache error",
+ "ECC error in L3 cache tag",
+ "L3 LRU parity bits error",
+ "ECC Error in the Probe Filter directory"
+};
+
+static bool f12h_dc_mce(u16 ec)
+{
+ bool ret = false;
+
+ if (MEM_ERROR(ec)) {
+ u8 ll = ec & 0x3;
+ ret = true;
+
+ if (ll == LL_L2)
+ pr_cont("during L1 linefill from L2.\n");
+ else if (ll == LL_L1)
+ pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
+ else
+ ret = false;
+ }
+ return ret;
+}
+
+static bool f10h_dc_mce(u16 ec)
+{
+ u8 r4 = (ec >> 4) & 0xf;
+ u8 ll = ec & 0x3;
+
+ if (r4 == R4_GEN && ll == LL_L1) {
+ pr_cont("during data scrub.\n");
+ return true;
+ }
+ return f12h_dc_mce(ec);
+}
+
+static bool k8_dc_mce(u16 ec)
+{
+ if (BUS_ERROR(ec)) {
+ pr_cont("during system linefill.\n");
+ return true;
+ }
+
+ return f10h_dc_mce(ec);
+}
+
+static bool f14h_dc_mce(u16 ec)
+{
+ u8 r4 = (ec >> 4) & 0xf;
+ u8 ll = ec & 0x3;
+ u8 tt = (ec >> 2) & 0x3;
+ u8 ii = tt;
+ bool ret = true;
+
+ if (MEM_ERROR(ec)) {
+
+ if (tt != TT_DATA || ll != LL_L1)
+ return false;
+
+ switch (r4) {
+ case R4_DRD:
+ case R4_DWR:
+ pr_cont("Data/Tag parity error due to %s.\n",
+ (r4 == R4_DRD ? "load/hw prf" : "store"));
+ break;
+ case R4_EVICT:
+ pr_cont("Copyback parity error on a tag miss.\n");
+ break;
+ case R4_SNOOP:
+ pr_cont("Tag parity error during snoop.\n");
+ break;
+ default:
+ ret = false;
+ }
+ } else if (BUS_ERROR(ec)) {
+
+ if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
+ return false;
+
+ pr_cont("System read data error on a ");
+
+ switch (r4) {
+ case R4_RD:
+ pr_cont("TLB reload.\n");
+ break;
+ case R4_DWR:
+ pr_cont("store.\n");
+ break;
+ case R4_DRD:
+ pr_cont("load.\n");
+ break;
+ default:
+ ret = false;
+ }
+ } else {
+ ret = false;
+ }
+
+ return ret;
+}
+
+static void amd_decode_dc_mce(struct mce *m)
+{
+ u16 ec = m->status & 0xffff;
+ u8 xec = (m->status >> 16) & 0xf;
+
+ pr_emerg(HW_ERR "Data Cache Error: ");
+
+ /* TLB error signatures are the same across families */
+ if (TLB_ERROR(ec)) {
+ u8 tt = (ec >> 2) & 0x3;
+
+ if (tt == TT_DATA) {
+ pr_cont("%s TLB %s.\n", LL_MSG(ec),
+ (xec ? "multimatch" : "parity error"));
+ return;
+ }
+ else
+ goto wrong_dc_mce;
+ }
+
+ if (!fam_ops->dc_mce(ec))
+ goto wrong_dc_mce;
+
+ return;
+
+wrong_dc_mce:
+ pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
+}
+
+static bool k8_ic_mce(u16 ec)
+{
+ u8 ll = ec & 0x3;
+ u8 r4 = (ec >> 4) & 0xf;
+ bool ret = true;
+
+ if (!MEM_ERROR(ec))
+ return false;
+
+ if (ll == 0x2)
+ pr_cont("during a linefill from L2.\n");
+ else if (ll == 0x1) {
+ switch (r4) {
+ case R4_IRD:
+ pr_cont("Parity error during data load.\n");
+ break;
+
+ case R4_EVICT:
+ pr_cont("Copyback Parity/Victim error.\n");
+ break;
+
+ case R4_SNOOP:
+ pr_cont("Tag Snoop error.\n");
+ break;
+
+ default:
+ ret = false;
+ break;
+ }
+ } else
+ ret = false;
+
+ return ret;
+}
+
+static bool f14h_ic_mce(u16 ec)
+{
+ u8 ll = ec & 0x3;
+ u8 tt = (ec >> 2) & 0x3;
+ u8 r4 = (ec >> 4) & 0xf;
+ bool ret = true;
+
+ if (MEM_ERROR(ec)) {
+ if (tt != 0 || ll != 1)
+ ret = false;
+
+ if (r4 == R4_IRD)
+ pr_cont("Data/tag array parity error for a tag hit.\n");
+ else if (r4 == R4_SNOOP)
+ pr_cont("Tag error during snoop/victimization.\n");
+ else
+ ret = false;
+ }
+ return ret;
+}
+
+static void amd_decode_ic_mce(struct mce *m)
+{
+ u16 ec = m->status & 0xffff;
+ u8 xec = (m->status >> 16) & 0xf;
+
+ pr_emerg(HW_ERR "Instruction Cache Error: ");
+
+ if (TLB_ERROR(ec))
+ pr_cont("%s TLB %s.\n", LL_MSG(ec),
+ (xec ? "multimatch" : "parity error"));
+ else if (BUS_ERROR(ec)) {
+ bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
+
+ pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
+ } else if (fam_ops->ic_mce(ec))
+ ;
+ else
+ pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
+}
+
+static void amd_decode_bu_mce(struct mce *m)
+{
+ u32 ec = m->status & 0xffff;
+ u32 xec = (m->status >> 16) & 0xf;
+
+ pr_emerg(HW_ERR "Bus Unit Error");
+
+ if (xec == 0x1)
+ pr_cont(" in the write data buffers.\n");
+ else if (xec == 0x3)
+ pr_cont(" in the victim data buffers.\n");
+ else if (xec == 0x2 && MEM_ERROR(ec))
+ pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
+ else if (xec == 0x0) {
+ if (TLB_ERROR(ec))
+ pr_cont(": %s error in a Page Descriptor Cache or "
+ "Guest TLB.\n", TT_MSG(ec));
+ else if (BUS_ERROR(ec))
+ pr_cont(": %s/ECC error in data read from NB: %s.\n",
+ RRRR_MSG(ec), PP_MSG(ec));
+ else if (MEM_ERROR(ec)) {
+ u8 rrrr = (ec >> 4) & 0xf;
+
+ if (rrrr >= 0x7)
+ pr_cont(": %s error during data copyback.\n",
+ RRRR_MSG(ec));
+ else if (rrrr <= 0x1)
+ pr_cont(": %s parity/ECC error during data "
+ "access from L2.\n", RRRR_MSG(ec));
+ else
+ goto wrong_bu_mce;
+ } else
+ goto wrong_bu_mce;
+ } else
+ goto wrong_bu_mce;
+
+ return;
+
+wrong_bu_mce:
+ pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
+}
+
+static void amd_decode_ls_mce(struct mce *m)
+{
+ u16 ec = m->status & 0xffff;
+ u8 xec = (m->status >> 16) & 0xf;
+
+ if (boot_cpu_data.x86 == 0x14) {
+ pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
+ " please report on LKML.\n");
+ return;
+ }
+
+ pr_emerg(HW_ERR "Load Store Error");
+
+ if (xec == 0x0) {
+ u8 r4 = (ec >> 4) & 0xf;
+
+ if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
+ goto wrong_ls_mce;
+
+ pr_cont(" during %s.\n", RRRR_MSG(ec));
+ } else
+ goto wrong_ls_mce;
+
+ return;
+
+wrong_ls_mce:
+ pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
+}
+
+static bool k8_nb_mce(u16 ec, u8 xec)
+{
+ bool ret = true;
+
+ switch (xec) {
+ case 0x1:
+ pr_cont("CRC error detected on HT link.\n");
+ break;
+
+ case 0x5:
+ pr_cont("Invalid GART PTE entry during GART table walk.\n");
+ break;
+
+ case 0x6:
+ pr_cont("Unsupported atomic RMW received from an IO link.\n");
+ break;
+
+ case 0x0:
+ case 0x8:
+ if (boot_cpu_data.x86 == 0x11)
+ return false;
+
+ pr_cont("DRAM ECC error detected on the NB.\n");
+ break;
+
+ case 0xd:
+ pr_cont("Parity error on the DRAM addr/ctl signals.\n");
+ break;
+
+ default:
+ ret = false;
+ break;
+ }
+
+ return ret;
+}
+
+static bool f10h_nb_mce(u16 ec, u8 xec)
+{
+ bool ret = true;
+ u8 offset = 0;
+
+ if (k8_nb_mce(ec, xec))
+ return true;
+
+ switch(xec) {
+ case 0xa ... 0xc:
+ offset = 10;
+ break;
+
+ case 0xe:
+ offset = 11;
+ break;
+
+ case 0xf:
+ if (TLB_ERROR(ec))
+ pr_cont("GART Table Walk data error.\n");
+ else if (BUS_ERROR(ec))
+ pr_cont("DMA Exclusion Vector Table Walk error.\n");
+ else
+ ret = false;
+
+ goto out;
+ break;
+
+ case 0x1c ... 0x1f:
+ offset = 24;
+ break;
+
+ default:
+ ret = false;
+
+ goto out;
+ break;
+ }
+
+ pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
+
+out:
+ return ret;
+}
+
+static bool nb_noop_mce(u16 ec, u8 xec)
+{
+ return false;
+}
+
+void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
+{
+ u8 xec = (m->status >> 16) & 0x1f;
+ u16 ec = m->status & 0xffff;
+ u32 nbsh = (u32)(m->status >> 32);
+
+ pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
+
+ /*
+ * F10h, revD can disable ErrCpu[3:0] so check that first and also the
+ * value encoding has changed so interpret those differently
+ */
+ if ((boot_cpu_data.x86 == 0x10) &&
+ (boot_cpu_data.x86_model > 7)) {
+ if (nbsh & K8_NBSH_ERR_CPU_VAL)
+ pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
+ } else {
+ u8 assoc_cpus = nbsh & nb_err_cpumask;
+
+ if (assoc_cpus > 0)
+ pr_cont(", core: %d", fls(assoc_cpus) - 1);
+ }
+
+ switch (xec) {
+ case 0x2:
+ pr_cont("Sync error (sync packets on HT link detected).\n");
+ return;
+
+ case 0x3:
+ pr_cont("HT Master abort.\n");
+ return;
+
+ case 0x4:
+ pr_cont("HT Target abort.\n");
+ return;
+
+ case 0x7:
+ pr_cont("NB Watchdog timeout.\n");
+ return;
+
+ case 0x9:
+ pr_cont("SVM DMA Exclusion Vector error.\n");
+ return;
+
+ default:
+ break;
+ }
+
+ if (!fam_ops->nb_mce(ec, xec))
+ goto wrong_nb_mce;
+
+ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
+ if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
+ nb_bus_decoder(node_id, m, nbcfg);
+
+ return;
+
+wrong_nb_mce:
+ pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
+}
+EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
+
+static void amd_decode_fr_mce(struct mce *m)
+{
+ if (boot_cpu_data.x86 == 0xf ||
+ boot_cpu_data.x86 == 0x11)
+ goto wrong_fr_mce;
+
+ /* we have only one error signature so match all fields at once. */
+ if ((m->status & 0xffff) == 0x0f0f) {
+ pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
+ return;
+ }
+
+wrong_fr_mce:
+ pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
+}
+
+static inline void amd_decode_err_code(u16 ec)
+{
+ if (TLB_ERROR(ec)) {
+ pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
+ TT_MSG(ec), LL_MSG(ec));
+ } else if (MEM_ERROR(ec)) {
+ pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
+ RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
+ } else if (BUS_ERROR(ec)) {
+ pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
+ "Participating Processor: %s\n",
+ RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
+ PP_MSG(ec));
+ } else
+ pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
+}
+
+/*
+ * Filter out unwanted MCE signatures here.
+ */
+static bool amd_filter_mce(struct mce *m)
+{
+ u8 xec = (m->status >> 16) & 0x1f;
+
+ /*
+ * NB GART TLB error reporting is disabled by default.
+ */
+ if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
+ return true;
+
+ return false;
+}
+
+int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
+{
+ struct mce *m = (struct mce *)data;
+ int node, ecc;
+
+ if (amd_filter_mce(m))
+ return NOTIFY_STOP;
+
+ pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
+
+ pr_cont("%sorrected error, other errors lost: %s, "
+ "CPU context corrupt: %s",
+ ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
+ ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
+ ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
+
+ /* do the two bits[14:13] together */
+ ecc = (m->status >> 45) & 0x3;
+ if (ecc)
+ pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
+
+ pr_cont("\n");
+
+ switch (m->bank) {
+ case 0:
+ amd_decode_dc_mce(m);
+ break;
+
+ case 1:
+ amd_decode_ic_mce(m);
+ break;
+
+ case 2:
+ amd_decode_bu_mce(m);
+ break;
+
+ case 3:
+ amd_decode_ls_mce(m);
+ break;
+
+ case 4:
+ node = amd_get_nb_id(m->extcpu);
+ amd_decode_nb_mce(node, m, 0);
+ break;
+
+ case 5:
+ amd_decode_fr_mce(m);
+ break;
+
+ default:
+ break;
+ }
+
+ amd_decode_err_code(m->status & 0xffff);
+
+ return NOTIFY_STOP;
+}
+EXPORT_SYMBOL_GPL(amd_decode_mce);
+
+static struct notifier_block amd_mce_dec_nb = {
+ .notifier_call = amd_decode_mce,
+};
+
+static int __init mce_amd_init(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return 0;
+
+ if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) &&
+ (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
+ return 0;
+
+ fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
+ if (!fam_ops)
+ return -ENOMEM;
+
+ switch (boot_cpu_data.x86) {
+ case 0xf:
+ fam_ops->dc_mce = k8_dc_mce;
+ fam_ops->ic_mce = k8_ic_mce;
+ fam_ops->nb_mce = k8_nb_mce;
+ break;
+
+ case 0x10:
+ fam_ops->dc_mce = f10h_dc_mce;
+ fam_ops->ic_mce = k8_ic_mce;
+ fam_ops->nb_mce = f10h_nb_mce;
+ break;
+
+ case 0x11:
+ fam_ops->dc_mce = k8_dc_mce;
+ fam_ops->ic_mce = k8_ic_mce;
+ fam_ops->nb_mce = f10h_nb_mce;
+ break;
+
+ case 0x12:
+ fam_ops->dc_mce = f12h_dc_mce;
+ fam_ops->ic_mce = k8_ic_mce;
+ fam_ops->nb_mce = nb_noop_mce;
+ break;
+
+ case 0x14:
+ nb_err_cpumask = 0x3;
+ fam_ops->dc_mce = f14h_dc_mce;
+ fam_ops->ic_mce = f14h_ic_mce;
+ fam_ops->nb_mce = nb_noop_mce;
+ break;
+
+ default:
+ printk(KERN_WARNING "Huh? What family is that: %d?!\n",
+ boot_cpu_data.x86);
+ kfree(fam_ops);
+ return -EINVAL;
+ }
+
+ pr_info("MCE: In-kernel MCE decoding enabled.\n");
+
+ atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
+
+ return 0;
+}
+early_initcall(mce_amd_init);
+
+#ifdef MODULE
+static void __exit mce_amd_exit(void)
+{
+ atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
+ kfree(fam_ops);
+}
+
+MODULE_DESCRIPTION("AMD MCE decoder");
+MODULE_ALIAS("edac-mce-amd");
+MODULE_LICENSE("GPL");
+module_exit(mce_amd_exit);
+#endif
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/mce_amd.h
similarity index 65%
rename from drivers/edac/edac_mce_amd.h
rename to drivers/edac/mce_amd.h
index df23ee065f79..35f6e0e3b297 100644
--- a/drivers/edac/edac_mce_amd.h
+++ b/drivers/edac/mce_amd.h
@@ -1,11 +1,14 @@
#ifndef _EDAC_MCE_AMD_H
#define _EDAC_MCE_AMD_H
+#include
+
#include
+#define BIT_64(n) (U64_C(1) << (n))
+
#define ERROR_CODE(x) ((x) & 0xffff)
#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
-#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
@@ -20,13 +23,14 @@
#define II_MSG(x) ii_msgs[II(x)]
#define LL(x) (((x) >> 0) & 0x3)
#define LL_MSG(x) ll_msgs[LL(x)]
-#define RRRR(x) (((x) >> 4) & 0xf)
-#define RRRR_MSG(x) rrrr_msgs[RRRR(x)]
#define TO(x) (((x) >> 8) & 0x1)
#define TO_MSG(x) to_msgs[TO(x)]
#define PP(x) (((x) >> 9) & 0x3)
#define PP_MSG(x) pp_msgs[PP(x)]
+#define RRRR(x) (((x) >> 4) & 0xf)
+#define RRRR_MSG(x) ((RRRR(x) < 9) ? rrrr_msgs[RRRR(x)] : "Wrong R4!")
+
#define K8_NBSH 0x4C
#define K8_NBSH_VALID_BIT BIT(31)
@@ -41,13 +45,45 @@
#define K8_NBSH_UECC BIT(13)
#define K8_NBSH_ERR_SCRUBER BIT(8)
+enum tt_ids {
+ TT_INSTR = 0,
+ TT_DATA,
+ TT_GEN,
+ TT_RESV,
+};
+
+enum ll_ids {
+ LL_RESV = 0,
+ LL_L1,
+ LL_L2,
+ LL_LG,
+};
+
+enum ii_ids {
+ II_MEM = 0,
+ II_RESV,
+ II_IO,
+ II_GEN,
+};
+
+enum rrrr_ids {
+ R4_GEN = 0,
+ R4_RD,
+ R4_WR,
+ R4_DRD,
+ R4_DWR,
+ R4_IRD,
+ R4_PREF,
+ R4_EVICT,
+ R4_SNOOP,
+};
+
extern const char *tt_msgs[];
extern const char *ll_msgs[];
extern const char *rrrr_msgs[];
extern const char *pp_msgs[];
extern const char *to_msgs[];
extern const char *ii_msgs[];
-extern const char *ext_msgs[];
/*
* relevant NB regs
@@ -60,10 +96,19 @@ struct err_regs {
u32 nbeal;
};
+/*
+ * per-family decoder ops
+ */
+struct amd_decoder_ops {
+ bool (*dc_mce)(u16);
+ bool (*ic_mce)(u16);
+ bool (*nb_mce)(u16, u8);
+};
void amd_report_gart_errors(bool);
-void amd_register_ecc_decoder(void (*f)(int, struct err_regs *));
-void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *));
-void amd_decode_nb_mce(int, struct err_regs *, int);
+void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32));
+void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32));
+void amd_decode_nb_mce(int, struct mce *, u32);
+int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data);
#endif /* _EDAC_MCE_AMD_H */
diff --git a/drivers/edac/mce_amd_inj.c b/drivers/edac/mce_amd_inj.c
new file mode 100644
index 000000000000..8d0688f36d4c
--- /dev/null
+++ b/drivers/edac/mce_amd_inj.c
@@ -0,0 +1,171 @@
+/*
+ * A simple MCE injection facility for testing the MCE decoding code. This
+ * driver should be built as module so that it can be loaded on production
+ * kernels for testing purposes.
+ *
+ * This file may be distributed under the terms of the GNU General Public
+ * License version 2.
+ *
+ * Copyright (c) 2010: Borislav Petkov
+ * Advanced Micro Devices Inc.
+ */
+
+#include
+#include
+#include
+#include
+
+#include "mce_amd.h"
+
+struct edac_mce_attr {
+ struct attribute attr;
+ ssize_t (*show) (struct kobject *kobj, struct edac_mce_attr *attr, char *buf);
+ ssize_t (*store)(struct kobject *kobj, struct edac_mce_attr *attr,
+ const char *buf, size_t count);
+};
+
+#define EDAC_MCE_ATTR(_name, _mode, _show, _store) \
+static struct edac_mce_attr mce_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+static struct kobject *mce_kobj;
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+
+#define MCE_INJECT_STORE(reg) \
+static ssize_t edac_inject_##reg##_store(struct kobject *kobj, \
+ struct edac_mce_attr *attr, \
+ const char *data, size_t count)\
+{ \
+ int ret = 0; \
+ unsigned long value; \
+ \
+ ret = strict_strtoul(data, 16, &value); \
+ if (ret < 0) \
+ printk(KERN_ERR "Error writing MCE " #reg " field.\n"); \
+ \
+ i_mce.reg = value; \
+ \
+ return count; \
+}
+
+MCE_INJECT_STORE(status);
+MCE_INJECT_STORE(misc);
+MCE_INJECT_STORE(addr);
+
+#define MCE_INJECT_SHOW(reg) \
+static ssize_t edac_inject_##reg##_show(struct kobject *kobj, \
+ struct edac_mce_attr *attr, \
+ char *buf) \
+{ \
+ return sprintf(buf, "0x%016llx\n", i_mce.reg); \
+}
+
+MCE_INJECT_SHOW(status);
+MCE_INJECT_SHOW(misc);
+MCE_INJECT_SHOW(addr);
+
+EDAC_MCE_ATTR(status, 0644, edac_inject_status_show, edac_inject_status_store);
+EDAC_MCE_ATTR(misc, 0644, edac_inject_misc_show, edac_inject_misc_store);
+EDAC_MCE_ATTR(addr, 0644, edac_inject_addr_show, edac_inject_addr_store);
+
+/*
+ * This denotes into which bank we're injecting and triggers
+ * the injection, at the same time.
+ */
+static ssize_t edac_inject_bank_store(struct kobject *kobj,
+ struct edac_mce_attr *attr,
+ const char *data, size_t count)
+{
+ int ret = 0;
+ unsigned long value;
+
+ ret = strict_strtoul(data, 10, &value);
+ if (ret < 0) {
+ printk(KERN_ERR "Invalid bank value!\n");
+ return -EINVAL;
+ }
+
+ if (value > 5) {
+ printk(KERN_ERR "Non-existant MCE bank: %lu\n", value);
+ return -EINVAL;
+ }
+
+ i_mce.bank = value;
+
+ amd_decode_mce(NULL, 0, &i_mce);
+
+ return count;
+}
+
+static ssize_t edac_inject_bank_show(struct kobject *kobj,
+ struct edac_mce_attr *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", i_mce.bank);
+}
+
+EDAC_MCE_ATTR(bank, 0644, edac_inject_bank_show, edac_inject_bank_store);
+
+static struct edac_mce_attr *sysfs_attrs[] = { &mce_attr_status, &mce_attr_misc,
+ &mce_attr_addr, &mce_attr_bank
+};
+
+static int __init edac_init_mce_inject(void)
+{
+ struct sysdev_class *edac_class = NULL;
+ int i, err = 0;
+
+ edac_class = edac_get_sysfs_class();
+ if (!edac_class)
+ return -EINVAL;
+
+ mce_kobj = kobject_create_and_add("mce", &edac_class->kset.kobj);
+ if (!mce_kobj) {
+ printk(KERN_ERR "Error creating a mce kset.\n");
+ err = -ENOMEM;
+ goto err_mce_kobj;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(sysfs_attrs); i++) {
+ err = sysfs_create_file(mce_kobj, &sysfs_attrs[i]->attr);
+ if (err) {
+ printk(KERN_ERR "Error creating %s in sysfs.\n",
+ sysfs_attrs[i]->attr.name);
+ goto err_sysfs_create;
+ }
+ }
+ return 0;
+
+err_sysfs_create:
+ while (i-- >= 0)
+ sysfs_remove_file(mce_kobj, &sysfs_attrs[i]->attr);
+
+ kobject_del(mce_kobj);
+
+err_mce_kobj:
+ edac_put_sysfs_class();
+
+ return err;
+}
+
+static void __exit edac_exit_mce_inject(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(sysfs_attrs); i++)
+ sysfs_remove_file(mce_kobj, &sysfs_attrs[i]->attr);
+
+ kobject_del(mce_kobj);
+
+ edac_put_sysfs_class();
+}
+
+module_init(edac_init_mce_inject);
+module_exit(edac_exit_mce_inject);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Borislav Petkov ");
+MODULE_AUTHOR("AMD Inc.");
+MODULE_DESCRIPTION("MCE injection facility for testing MCE decoding");
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 280c9b5ad9e3..88a3ae6cd023 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -125,7 +125,7 @@ config ISCSI_IBFT_FIND
config ISCSI_IBFT
tristate "iSCSI Boot Firmware Table Attributes module"
select ISCSI_BOOT_SYSFS
- depends on ISCSI_IBFT_FIND && SCSI
+ depends on ISCSI_IBFT_FIND && SCSI && SCSI_LOWLEVEL
default n
help
This option enables support for detection and exposing of iSCSI
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index c37ef64d1465..cb3ccf3ed221 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -59,18 +59,11 @@
#include /* ktime_get_real() */
#include
#include
+#include