mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-11 16:29:05 +00:00
Linux v4.13-rc1
-----BEGIN PGP SIGNATURE-----

iQEcBAABAgAGBQJZapWhAAoJEHm+PkMAQRiGKb0IAJM6b7SbWaw69Og7+qiFB+zZ
xp29iXqbE9fPISC6a5BRQV1ONjeDM6opGixGHqGC8Hla6k2IYz25VDNoF8wd0MXN
cz/Ih20vd3C5afxXGe5cTT8lsPAlV0mWXxForlu6j8jPeL62FPfq6RhEkw7AcrYL
yfYy3k3qSdOrrvBdII0WAAUi46UfIs+we9BQgbsMbkHOiqV2K0MOrzKE84Xbgepq
RAy2xg6P4b4+hTx8xTrYc1MXwpnqjRc0oJ08gdmiwW3AOOU7LxYFn7zDkLPWi9Rr
g4x6r4YhBTGxT4wNvovLIiqd9QFs//dMCuPWYwEtTICG48umIqqq24beQ0mvCdg=
=08Ic
-----END PGP SIGNATURE-----

Merge tag 'v4.13-rc1' into fixes

The fixes branch is based off a random pre-rc1 commit, because we had
some fixes that needed to go in before rc1 was released.

However we now need to fix some code that went in after that point, but
before rc1, so merge rc1 to get that code into fixes so we can fix it!
This commit is contained in:
commit
bb272221e9
@ -229,6 +229,6 @@ KernelVersion: 4.1
|
||||
Contact: linux-mtd@lists.infradead.org
|
||||
Description:
|
||||
For a partition, the offset of that partition from the start
|
||||
of the master device in bytes. This attribute is absent on
|
||||
main devices, so it can be used to distinguish between
|
||||
partitions and devices that aren't partitions.
|
||||
of the parent (another partition or a flash device) in bytes.
|
||||
This attribute is absent on flash devices, so it can be used
|
||||
to distinguish them from partitions.
|
||||
|
@ -75,7 +75,7 @@ Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the memory footprint used by f2fs.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/trim_sections
|
||||
What: /sys/fs/f2fs/<disk>/batched_trim_sections
|
||||
Date: February 2015
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
@ -112,3 +112,21 @@ Date: January 2016
|
||||
Contact: "Shuoran Liu" <liushuoran@huawei.com>
|
||||
Description:
|
||||
Shows total written kbytes issued to disk.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/inject_rate
|
||||
Date: May 2016
|
||||
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
||||
Description:
|
||||
Controls the injection rate.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/inject_type
|
||||
Date: May 2016
|
||||
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
||||
Description:
|
||||
Controls the injection type.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/reserved_blocks
|
||||
Date: June 2017
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls current reserved blocks in system.
|
||||
|
@ -1,22 +1,24 @@
|
||||
Dynamic DMA mapping Guide
|
||||
=========================
|
||||
=========================
|
||||
Dynamic DMA mapping Guide
|
||||
=========================
|
||||
|
||||
David S. Miller <davem@redhat.com>
|
||||
Richard Henderson <rth@cygnus.com>
|
||||
Jakub Jelinek <jakub@redhat.com>
|
||||
:Author: David S. Miller <davem@redhat.com>
|
||||
:Author: Richard Henderson <rth@cygnus.com>
|
||||
:Author: Jakub Jelinek <jakub@redhat.com>
|
||||
|
||||
This is a guide to device driver writers on how to use the DMA API
|
||||
with example pseudo-code. For a concise description of the API, see
|
||||
DMA-API.txt.
|
||||
|
||||
CPU and DMA addresses
|
||||
CPU and DMA addresses
|
||||
=====================
|
||||
|
||||
There are several kinds of addresses involved in the DMA API, and it's
|
||||
important to understand the differences.
|
||||
|
||||
The kernel normally uses virtual addresses. Any address returned by
|
||||
kmalloc(), vmalloc(), and similar interfaces is a virtual address and can
|
||||
be stored in a "void *".
|
||||
be stored in a ``void *``.
|
||||
|
||||
The virtual memory system (TLB, page tables, etc.) translates virtual
|
||||
addresses to CPU physical addresses, which are stored as "phys_addr_t" or
|
||||
@ -37,7 +39,7 @@ be restricted to a subset of that space. For example, even if a system
|
||||
supports 64-bit addresses for main memory and PCI BARs, it may use an IOMMU
|
||||
so devices only need to use 32-bit DMA addresses.
|
||||
|
||||
Here's a picture and some examples:
|
||||
Here's a picture and some examples::
|
||||
|
||||
CPU CPU Bus
|
||||
Virtual Physical Address
|
||||
@ -98,15 +100,16 @@ microprocessor architecture. You should use the DMA API rather than the
|
||||
bus-specific DMA API, i.e., use the dma_map_*() interfaces rather than the
|
||||
pci_map_*() interfaces.
|
||||
|
||||
First of all, you should make sure
|
||||
First of all, you should make sure::
|
||||
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
|
||||
is in your driver, which provides the definition of dma_addr_t. This type
|
||||
can hold any valid DMA address for the platform and should be used
|
||||
everywhere you hold a DMA address returned from the DMA mapping functions.
|
||||
|
||||
What memory is DMA'able?
|
||||
What memory is DMA'able?
|
||||
========================
|
||||
|
||||
The first piece of information you must know is what kernel memory can
|
||||
be used with the DMA mapping facilities. There has been an unwritten
|
||||
@ -143,7 +146,8 @@ What about block I/O and networking buffers? The block I/O and
|
||||
networking subsystems make sure that the buffers they use are valid
|
||||
for you to DMA from/to.
|
||||
|
||||
DMA addressing limitations
|
||||
DMA addressing limitations
|
||||
==========================
|
||||
|
||||
Does your device have any DMA addressing limitations? For example, is
|
||||
your device only capable of driving the low order 24-bits of address?
|
||||
@ -166,7 +170,7 @@ style to do this even if your device holds the default setting,
|
||||
because this shows that you did think about these issues with respect to your
|
||||
device.
|
||||
|
||||
The query is performed via a call to dma_set_mask_and_coherent():
|
||||
The query is performed via a call to dma_set_mask_and_coherent()::
|
||||
|
||||
int dma_set_mask_and_coherent(struct device *dev, u64 mask);
|
||||
|
||||
@ -175,12 +179,12 @@ If you have some special requirements, then the following two separate
|
||||
queries can be used instead:
|
||||
|
||||
The query for streaming mappings is performed via a call to
|
||||
dma_set_mask():
|
||||
dma_set_mask()::
|
||||
|
||||
int dma_set_mask(struct device *dev, u64 mask);
|
||||
|
||||
The query for consistent allocations is performed via a call
|
||||
to dma_set_coherent_mask():
|
||||
to dma_set_coherent_mask()::
|
||||
|
||||
int dma_set_coherent_mask(struct device *dev, u64 mask);
|
||||
|
||||
@ -209,7 +213,7 @@ of your driver reports that performance is bad or that the device is not
|
||||
even detected, you can ask them for the kernel messages to find out
|
||||
exactly why.
|
||||
|
||||
The standard 32-bit addressing device would do something like this:
|
||||
The standard 32-bit addressing device would do something like this::
|
||||
|
||||
if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
|
||||
dev_warn(dev, "mydev: No suitable DMA available\n");
|
||||
@ -225,7 +229,7 @@ than 64-bit addressing. For example, Sparc64 PCI SAC addressing is
|
||||
more efficient than DAC addressing.
|
||||
|
||||
Here is how you would handle a 64-bit capable device which can drive
|
||||
all 64-bits when accessing streaming DMA:
|
||||
all 64-bits when accessing streaming DMA::
|
||||
|
||||
int using_dac;
|
||||
|
||||
@ -239,7 +243,7 @@ all 64-bits when accessing streaming DMA:
|
||||
}
|
||||
|
||||
If a card is capable of using 64-bit consistent allocations as well,
|
||||
the case would look like this:
|
||||
the case would look like this::
|
||||
|
||||
int using_dac, consistent_using_dac;
|
||||
|
||||
@ -260,7 +264,7 @@ uses consistent allocations, one would have to check the return value from
|
||||
dma_set_coherent_mask().
|
||||
|
||||
Finally, if your device can only drive the low 24-bits of
|
||||
address you might do something like:
|
||||
address you might do something like::
|
||||
|
||||
if (dma_set_mask(dev, DMA_BIT_MASK(24))) {
|
||||
dev_warn(dev, "mydev: 24-bit DMA addressing not available\n");
|
||||
@ -280,7 +284,7 @@ only provide the functionality which the machine can handle. It
|
||||
is important that the last call to dma_set_mask() be for the
|
||||
most specific mask.
|
||||
|
||||
Here is pseudo-code showing how this might be done:
|
||||
Here is pseudo-code showing how this might be done::
|
||||
|
||||
#define PLAYBACK_ADDRESS_BITS DMA_BIT_MASK(32)
|
||||
#define RECORD_ADDRESS_BITS DMA_BIT_MASK(24)
|
||||
@ -308,7 +312,8 @@ A sound card was used as an example here because this genre of PCI
|
||||
devices seems to be littered with ISA chips given a PCI front end,
|
||||
and thus retaining the 16MB DMA addressing limitations of ISA.
|
||||
|
||||
Types of DMA mappings
|
||||
Types of DMA mappings
|
||||
=====================
|
||||
|
||||
There are two types of DMA mappings:
|
||||
|
||||
@ -336,12 +341,14 @@ There are two types of DMA mappings:
|
||||
to memory is immediately visible to the device, and vice
|
||||
versa. Consistent mappings guarantee this.
|
||||
|
||||
IMPORTANT: Consistent DMA memory does not preclude the usage of
|
||||
proper memory barriers. The CPU may reorder stores to
|
||||
.. important::
|
||||
|
||||
Consistent DMA memory does not preclude the usage of
|
||||
proper memory barriers. The CPU may reorder stores to
|
||||
consistent memory just as it may normal memory. Example:
|
||||
if it is important for the device to see the first word
|
||||
of a descriptor updated before the second, you must do
|
||||
something like:
|
||||
something like::
|
||||
|
||||
desc->word0 = address;
|
||||
wmb();
|
||||
@ -377,16 +384,17 @@ Also, systems with caches that aren't DMA-coherent will work better
|
||||
when the underlying buffers don't share cache lines with other data.
|
||||
|
||||
|
||||
Using Consistent DMA mappings.
|
||||
Using Consistent DMA mappings
|
||||
=============================
|
||||
|
||||
To allocate and map large (PAGE_SIZE or so) consistent DMA regions,
|
||||
you should do:
|
||||
you should do::
|
||||
|
||||
dma_addr_t dma_handle;
|
||||
|
||||
cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, gfp);
|
||||
|
||||
where device is a struct device *. This may be called in interrupt
|
||||
where device is a ``struct device *``. This may be called in interrupt
|
||||
context with the GFP_ATOMIC flag.
|
||||
|
||||
Size is the length of the region you want to allocate, in bytes.
|
||||
@ -415,7 +423,7 @@ exists (for example) to guarantee that if you allocate a chunk
|
||||
which is smaller than or equal to 64 kilobytes, the extent of the
|
||||
buffer you receive will not cross a 64K boundary.
|
||||
|
||||
To unmap and free such a DMA region, you call:
|
||||
To unmap and free such a DMA region, you call::
|
||||
|
||||
dma_free_coherent(dev, size, cpu_addr, dma_handle);
|
||||
|
||||
@ -430,7 +438,7 @@ a kmem_cache, but it uses dma_alloc_coherent(), not __get_free_pages().
|
||||
Also, it understands common hardware constraints for alignment,
|
||||
like queue heads needing to be aligned on N byte boundaries.
|
||||
|
||||
Create a dma_pool like this:
|
||||
Create a dma_pool like this::
|
||||
|
||||
struct dma_pool *pool;
|
||||
|
||||
@ -444,7 +452,7 @@ pass 0 for boundary; passing 4096 says memory allocated from this pool
|
||||
must not cross 4KByte boundaries (but at that time it may be better to
|
||||
use dma_alloc_coherent() directly instead).
|
||||
|
||||
Allocate memory from a DMA pool like this:
|
||||
Allocate memory from a DMA pool like this::
|
||||
|
||||
cpu_addr = dma_pool_alloc(pool, flags, &dma_handle);
|
||||
|
||||
@ -452,7 +460,7 @@ flags are GFP_KERNEL if blocking is permitted (not in_interrupt nor
|
||||
holding SMP locks), GFP_ATOMIC otherwise. Like dma_alloc_coherent(),
|
||||
this returns two values, cpu_addr and dma_handle.
|
||||
|
||||
Free memory that was allocated from a dma_pool like this:
|
||||
Free memory that was allocated from a dma_pool like this::
|
||||
|
||||
dma_pool_free(pool, cpu_addr, dma_handle);
|
||||
|
||||
@ -460,7 +468,7 @@ where pool is what you passed to dma_pool_alloc(), and cpu_addr and
|
||||
dma_handle are the values dma_pool_alloc() returned. This function
|
||||
may be called in interrupt context.
|
||||
|
||||
Destroy a dma_pool by calling:
|
||||
Destroy a dma_pool by calling::
|
||||
|
||||
dma_pool_destroy(pool);
|
||||
|
||||
@ -468,11 +476,12 @@ Make sure you've called dma_pool_free() for all memory allocated
|
||||
from a pool before you destroy the pool. This function may not
|
||||
be called in interrupt context.
|
||||
|
||||
DMA Direction
|
||||
DMA Direction
|
||||
=============
|
||||
|
||||
The interfaces described in subsequent portions of this document
|
||||
take a DMA direction argument, which is an integer and takes on
|
||||
one of the following values:
|
||||
one of the following values::
|
||||
|
||||
DMA_BIDIRECTIONAL
|
||||
DMA_TO_DEVICE
|
||||
@ -521,14 +530,15 @@ packets, map/unmap them with the DMA_TO_DEVICE direction
|
||||
specifier. For receive packets, just the opposite, map/unmap them
|
||||
with the DMA_FROM_DEVICE direction specifier.
|
||||
|
||||
Using Streaming DMA mappings
|
||||
Using Streaming DMA mappings
|
||||
============================
|
||||
|
||||
The streaming DMA mapping routines can be called from interrupt
|
||||
context. There are two versions of each map/unmap, one which will
|
||||
map/unmap a single memory region, and one which will map/unmap a
|
||||
scatterlist.
|
||||
|
||||
To map a single region, you do:
|
||||
To map a single region, you do::
|
||||
|
||||
struct device *dev = &my_dev->dev;
|
||||
dma_addr_t dma_handle;
|
||||
@ -545,7 +555,7 @@ To map a single region, you do:
|
||||
goto map_error_handling;
|
||||
}
|
||||
|
||||
and to unmap it:
|
||||
and to unmap it::
|
||||
|
||||
dma_unmap_single(dev, dma_handle, size, direction);
|
||||
|
||||
@ -563,7 +573,7 @@ Using CPU pointers like this for single mappings has a disadvantage:
|
||||
you cannot reference HIGHMEM memory in this way. Thus, there is a
|
||||
map/unmap interface pair akin to dma_{map,unmap}_single(). These
|
||||
interfaces deal with page/offset pairs instead of CPU pointers.
|
||||
Specifically:
|
||||
Specifically::
|
||||
|
||||
struct device *dev = &my_dev->dev;
|
||||
dma_addr_t dma_handle;
|
||||
@ -593,7 +603,7 @@ error as outlined under the dma_map_single() discussion.
|
||||
You should call dma_unmap_page() when the DMA activity is finished, e.g.,
|
||||
from the interrupt which told you that the DMA transfer is done.
|
||||
|
||||
With scatterlists, you map a region gathered from several regions by:
|
||||
With scatterlists, you map a region gathered from several regions by::
|
||||
|
||||
int i, count = dma_map_sg(dev, sglist, nents, direction);
|
||||
struct scatterlist *sg;
|
||||
@ -617,16 +627,18 @@ Then you should loop count times (note: this can be less than nents times)
|
||||
and use sg_dma_address() and sg_dma_len() macros where you previously
|
||||
accessed sg->address and sg->length as shown above.
|
||||
|
||||
To unmap a scatterlist, just call:
|
||||
To unmap a scatterlist, just call::
|
||||
|
||||
dma_unmap_sg(dev, sglist, nents, direction);
|
||||
|
||||
Again, make sure DMA activity has already finished.
|
||||
|
||||
PLEASE NOTE: The 'nents' argument to the dma_unmap_sg call must be
|
||||
the _same_ one you passed into the dma_map_sg call,
|
||||
it should _NOT_ be the 'count' value _returned_ from the
|
||||
dma_map_sg call.
|
||||
.. note::
|
||||
|
||||
The 'nents' argument to the dma_unmap_sg call must be
|
||||
the _same_ one you passed into the dma_map_sg call,
|
||||
it should _NOT_ be the 'count' value _returned_ from the
|
||||
dma_map_sg call.
|
||||
|
||||
Every dma_map_{single,sg}() call should have its dma_unmap_{single,sg}()
|
||||
counterpart, because the DMA address space is a shared resource and
|
||||
@ -638,11 +650,11 @@ properly in order for the CPU and device to see the most up-to-date and
|
||||
correct copy of the DMA buffer.
|
||||
|
||||
So, firstly, just map it with dma_map_{single,sg}(), and after each DMA
|
||||
transfer call either:
|
||||
transfer call either::
|
||||
|
||||
dma_sync_single_for_cpu(dev, dma_handle, size, direction);
|
||||
|
||||
or:
|
||||
or::
|
||||
|
||||
dma_sync_sg_for_cpu(dev, sglist, nents, direction);
|
||||
|
||||
@ -650,17 +662,19 @@ as appropriate.
|
||||
|
||||
Then, if you wish to let the device get at the DMA area again,
|
||||
finish accessing the data with the CPU, and then before actually
|
||||
giving the buffer to the hardware call either:
|
||||
giving the buffer to the hardware call either::
|
||||
|
||||
dma_sync_single_for_device(dev, dma_handle, size, direction);
|
||||
|
||||
or:
|
||||
or::
|
||||
|
||||
dma_sync_sg_for_device(dev, sglist, nents, direction);
|
||||
|
||||
as appropriate.
|
||||
|
||||
PLEASE NOTE: The 'nents' argument to dma_sync_sg_for_cpu() and
|
||||
.. note::
|
||||
|
||||
The 'nents' argument to dma_sync_sg_for_cpu() and
|
||||
dma_sync_sg_for_device() must be the same passed to
|
||||
dma_map_sg(). It is _NOT_ the count returned by
|
||||
dma_map_sg().
|
||||
@ -671,7 +685,7 @@ dma_map_*() call till dma_unmap_*(), then you don't have to call the
|
||||
dma_sync_*() routines at all.
|
||||
|
||||
Here is pseudo code which shows a situation in which you would need
|
||||
to use the dma_sync_*() interfaces.
|
||||
to use the dma_sync_*() interfaces::
|
||||
|
||||
my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len)
|
||||
{
|
||||
@ -747,7 +761,8 @@ is planned to completely remove virt_to_bus() and bus_to_virt() as
|
||||
they are entirely deprecated. Some ports already do not provide these
|
||||
as it is impossible to correctly support them.
|
||||
|
||||
Handling Errors
|
||||
Handling Errors
|
||||
===============
|
||||
|
||||
DMA address space is limited on some architectures and an allocation
|
||||
failure can be determined by:
|
||||
@ -755,7 +770,7 @@ failure can be determined by:
|
||||
- checking if dma_alloc_coherent() returns NULL or dma_map_sg returns 0
|
||||
|
||||
- checking the dma_addr_t returned from dma_map_single() and dma_map_page()
|
||||
by using dma_mapping_error():
|
||||
by using dma_mapping_error()::
|
||||
|
||||
dma_addr_t dma_handle;
|
||||
|
||||
@ -773,7 +788,8 @@ failure can be determined by:
|
||||
of a multiple page mapping attempt. These examples are applicable to
|
||||
dma_map_page() as well.
|
||||
|
||||
Example 1:
|
||||
Example 1::
|
||||
|
||||
dma_addr_t dma_handle1;
|
||||
dma_addr_t dma_handle2;
|
||||
|
||||
@ -802,8 +818,12 @@ Example 1:
|
||||
dma_unmap_single(dma_handle1);
|
||||
map_error_handling1:
|
||||
|
||||
Example 2: (if buffers are allocated in a loop, unmap all mapped buffers when
|
||||
mapping error is detected in the middle)
|
||||
Example 2::
|
||||
|
||||
/*
|
||||
* if buffers are allocated in a loop, unmap all mapped buffers when
|
||||
* mapping error is detected in the middle
|
||||
*/
|
||||
|
||||
dma_addr_t dma_addr;
|
||||
dma_addr_t array[DMA_BUFFERS];
|
||||
@ -846,7 +866,8 @@ SCSI drivers must return SCSI_MLQUEUE_HOST_BUSY if the DMA mapping
|
||||
fails in the queuecommand hook. This means that the SCSI subsystem
|
||||
passes the command to the driver again later.
|
||||
|
||||
Optimizing Unmap State Space Consumption
|
||||
Optimizing Unmap State Space Consumption
|
||||
========================================
|
||||
|
||||
On many platforms, dma_unmap_{single,page}() is simply a nop.
|
||||
Therefore, keeping track of the mapping address and length is a waste
|
||||
@ -858,7 +879,7 @@ Actually, instead of describing the macros one by one, we'll
|
||||
transform some example code.
|
||||
|
||||
1) Use DEFINE_DMA_UNMAP_{ADDR,LEN} in state saving structures.
|
||||
Example, before:
|
||||
Example, before::
|
||||
|
||||
struct ring_state {
|
||||
struct sk_buff *skb;
|
||||
@ -866,7 +887,7 @@ transform some example code.
|
||||
__u32 len;
|
||||
};
|
||||
|
||||
after:
|
||||
after::
|
||||
|
||||
struct ring_state {
|
||||
struct sk_buff *skb;
|
||||
@ -875,23 +896,23 @@ transform some example code.
|
||||
};
|
||||
|
||||
2) Use dma_unmap_{addr,len}_set() to set these values.
|
||||
Example, before:
|
||||
Example, before::
|
||||
|
||||
ringp->mapping = FOO;
|
||||
ringp->len = BAR;
|
||||
|
||||
after:
|
||||
after::
|
||||
|
||||
dma_unmap_addr_set(ringp, mapping, FOO);
|
||||
dma_unmap_len_set(ringp, len, BAR);
|
||||
|
||||
3) Use dma_unmap_{addr,len}() to access these values.
|
||||
Example, before:
|
||||
Example, before::
|
||||
|
||||
dma_unmap_single(dev, ringp->mapping, ringp->len,
|
||||
DMA_FROM_DEVICE);
|
||||
|
||||
after:
|
||||
after::
|
||||
|
||||
dma_unmap_single(dev,
|
||||
dma_unmap_addr(ringp, mapping),
|
||||
@ -902,7 +923,8 @@ It really should be self-explanatory. We treat the ADDR and LEN
|
||||
separately, because it is possible for an implementation to only
|
||||
need the address in order to perform the unmap operation.
|
||||
|
||||
Platform Issues
|
||||
Platform Issues
|
||||
===============
|
||||
|
||||
If you are just writing drivers for Linux and do not maintain
|
||||
an architecture port for the kernel, you can safely skip down
|
||||
@ -928,12 +950,13 @@ to "Closing".
|
||||
alignment constraints (e.g. the alignment constraints about 64-bit
|
||||
objects).
|
||||
|
||||
Closing
|
||||
Closing
|
||||
=======
|
||||
|
||||
This document, and the API itself, would not be in its current
|
||||
form without the feedback and suggestions from numerous individuals.
|
||||
We would like to specifically mention, in no particular order, the
|
||||
following people:
|
||||
following people::
|
||||
|
||||
Russell King <rmk@arm.linux.org.uk>
|
||||
Leo Dagum <dagum@barrel.engr.sgi.com>
|
||||
|
@ -1,7 +1,8 @@
|
||||
Dynamic DMA mapping using the generic device
|
||||
============================================
|
||||
============================================
|
||||
Dynamic DMA mapping using the generic device
|
||||
============================================
|
||||
|
||||
James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
|
||||
:Author: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
|
||||
|
||||
This document describes the DMA API. For a more gentle introduction
|
||||
to the API (and actual examples), see Documentation/DMA-API-HOWTO.txt.
|
||||
@ -12,10 +13,10 @@ machines. Unless you know that your driver absolutely has to support
|
||||
non-consistent platforms (this is usually only legacy platforms) you
|
||||
should only use the API described in part I.
|
||||
|
||||
Part I - dma_ API
|
||||
-------------------------------------
|
||||
Part I - dma_API
|
||||
----------------
|
||||
|
||||
To get the dma_ API, you must #include <linux/dma-mapping.h>. This
|
||||
To get the dma_API, you must #include <linux/dma-mapping.h>. This
|
||||
provides dma_addr_t and the interfaces described below.
|
||||
|
||||
A dma_addr_t can hold any valid DMA address for the platform. It can be
|
||||
@ -26,9 +27,11 @@ address space and the DMA address space.
|
||||
Part Ia - Using large DMA-coherent buffers
|
||||
------------------------------------------
|
||||
|
||||
void *
|
||||
dma_alloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
::
|
||||
|
||||
void *
|
||||
dma_alloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
|
||||
Consistent memory is memory for which a write by either the device or
|
||||
the processor can immediately be read by the processor or device
|
||||
@ -51,20 +54,24 @@ consolidate your requests for consistent memory as much as possible.
|
||||
The simplest way to do that is to use the dma_pool calls (see below).
|
||||
|
||||
The flag parameter (dma_alloc_coherent() only) allows the caller to
|
||||
specify the GFP_ flags (see kmalloc()) for the allocation (the
|
||||
specify the ``GFP_`` flags (see kmalloc()) for the allocation (the
|
||||
implementation may choose to ignore flags that affect the location of
|
||||
the returned memory, like GFP_DMA).
|
||||
|
||||
void *
|
||||
dma_zalloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
::
|
||||
|
||||
void *
|
||||
dma_zalloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
|
||||
Wraps dma_alloc_coherent() and also zeroes the returned memory if the
|
||||
allocation attempt succeeded.
|
||||
|
||||
void
|
||||
dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
::
|
||||
|
||||
void
|
||||
dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
|
||||
Free a region of consistent memory you previously allocated. dev,
|
||||
size and dma_handle must all be the same as those passed into
|
||||
@ -78,7 +85,7 @@ may only be called with IRQs enabled.
|
||||
Part Ib - Using small DMA-coherent buffers
|
||||
------------------------------------------
|
||||
|
||||
To get this part of the dma_ API, you must #include <linux/dmapool.h>
|
||||
To get this part of the dma_API, you must #include <linux/dmapool.h>
|
||||
|
||||
Many drivers need lots of small DMA-coherent memory regions for DMA
|
||||
descriptors or I/O buffers. Rather than allocating in units of a page
|
||||
@ -88,6 +95,8 @@ not __get_free_pages(). Also, they understand common hardware constraints
|
||||
for alignment, like queue heads needing to be aligned on N-byte boundaries.
|
||||
|
||||
|
||||
::
|
||||
|
||||
struct dma_pool *
|
||||
dma_pool_create(const char *name, struct device *dev,
|
||||
size_t size, size_t align, size_t alloc);
|
||||
@ -103,16 +112,21 @@ in bytes, and must be a power of two). If your device has no boundary
|
||||
crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
|
||||
from this pool must not cross 4KByte boundaries.
|
||||
|
||||
::
|
||||
|
||||
void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
dma_addr_t *handle)
|
||||
void *
|
||||
dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
dma_addr_t *handle)
|
||||
|
||||
Wraps dma_pool_alloc() and also zeroes the returned memory if the
|
||||
allocation attempt succeeded.
|
||||
|
||||
|
||||
void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
|
||||
dma_addr_t *dma_handle);
|
||||
::
|
||||
|
||||
void *
|
||||
dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
|
||||
dma_addr_t *dma_handle);
|
||||
|
||||
This allocates memory from the pool; the returned memory will meet the
|
||||
size and alignment requirements specified at creation time. Pass
|
||||
@ -122,16 +136,20 @@ blocking. Like dma_alloc_coherent(), this returns two values: an
|
||||
address usable by the CPU, and the DMA address usable by the pool's
|
||||
device.
|
||||
|
||||
::
|
||||
|
||||
void dma_pool_free(struct dma_pool *pool, void *vaddr,
|
||||
dma_addr_t addr);
|
||||
void
|
||||
dma_pool_free(struct dma_pool *pool, void *vaddr,
|
||||
dma_addr_t addr);
|
||||
|
||||
This puts memory back into the pool. The pool is what was passed to
|
||||
dma_pool_alloc(); the CPU (vaddr) and DMA addresses are what
|
||||
were returned when that routine allocated the memory being freed.
|
||||
|
||||
::
|
||||
|
||||
void dma_pool_destroy(struct dma_pool *pool);
|
||||
void
|
||||
dma_pool_destroy(struct dma_pool *pool);
|
||||
|
||||
dma_pool_destroy() frees the resources of the pool. It must be
|
||||
called in a context which can sleep. Make sure you've freed all allocated
|
||||
@ -141,32 +159,40 @@ memory back to the pool before you destroy it.
|
||||
Part Ic - DMA addressing limitations
|
||||
------------------------------------
|
||||
|
||||
int
|
||||
dma_set_mask_and_coherent(struct device *dev, u64 mask)
|
||||
::
|
||||
|
||||
int
|
||||
dma_set_mask_and_coherent(struct device *dev, u64 mask)
|
||||
|
||||
Checks to see if the mask is possible and updates the device
|
||||
streaming and coherent DMA mask parameters if it is.
|
||||
|
||||
Returns: 0 if successful and a negative error if not.
|
||||
|
||||
int
|
||||
dma_set_mask(struct device *dev, u64 mask)
|
||||
::
|
||||
|
||||
int
|
||||
dma_set_mask(struct device *dev, u64 mask)
|
||||
|
||||
Checks to see if the mask is possible and updates the device
|
||||
parameters if it is.
|
||||
|
||||
Returns: 0 if successful and a negative error if not.
|
||||
|
||||
int
|
||||
dma_set_coherent_mask(struct device *dev, u64 mask)
|
||||
::
|
||||
|
||||
int
|
||||
dma_set_coherent_mask(struct device *dev, u64 mask)
|
||||
|
||||
Checks to see if the mask is possible and updates the device
|
||||
parameters if it is.
|
||||
|
||||
Returns: 0 if successful and a negative error if not.
|
||||
|
||||
u64
|
||||
dma_get_required_mask(struct device *dev)
|
||||
::
|
||||
|
||||
u64
|
||||
dma_get_required_mask(struct device *dev)
|
||||
|
||||
This API returns the mask that the platform requires to
|
||||
operate efficiently. Usually this means the returned mask
|
||||
@ -182,94 +208,107 @@ call to set the mask to the value returned.
|
||||
Part Id - Streaming DMA mappings
|
||||
--------------------------------
|
||||
|
||||
dma_addr_t
|
||||
dma_map_single(struct device *dev, void *cpu_addr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
::
|
||||
|
||||
dma_addr_t
|
||||
dma_map_single(struct device *dev, void *cpu_addr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Maps a piece of processor virtual memory so it can be accessed by the
|
||||
device and returns the DMA address of the memory.
|
||||
|
||||
The direction for both APIs may be converted freely by casting.
|
||||
However the dma_ API uses a strongly typed enumerator for its
|
||||
However the DMA API uses a strongly typed enumerator for its
|
||||
direction:
|
||||
|
||||
======================= =============================================
|
||||
DMA_NONE no direction (used for debugging)
|
||||
DMA_TO_DEVICE data is going from the memory to the device
|
||||
DMA_FROM_DEVICE data is coming from the device to the memory
|
||||
DMA_BIDIRECTIONAL direction isn't known
|
||||
======================= =============================================
|
||||
|
||||
Notes: Not all memory regions in a machine can be mapped by this API.
|
||||
Further, contiguous kernel virtual space may not be contiguous as
|
||||
physical memory. Since this API does not provide any scatter/gather
|
||||
capability, it will fail if the user tries to map a non-physically
|
||||
contiguous piece of memory. For this reason, memory to be mapped by
|
||||
this API should be obtained from sources which guarantee it to be
|
||||
physically contiguous (like kmalloc).
|
||||
.. note::
|
||||
|
||||
Further, the DMA address of the memory must be within the
|
||||
dma_mask of the device (the dma_mask is a bit mask of the
|
||||
addressable region for the device, i.e., if the DMA address of
|
||||
the memory ANDed with the dma_mask is still equal to the DMA
|
||||
address, then the device can perform DMA to the memory). To
|
||||
ensure that the memory allocated by kmalloc is within the dma_mask,
|
||||
the driver may specify various platform-dependent flags to restrict
|
||||
the DMA address range of the allocation (e.g., on x86, GFP_DMA
|
||||
guarantees to be within the first 16MB of available DMA addresses,
|
||||
as required by ISA devices).
|
||||
Not all memory regions in a machine can be mapped by this API.
|
||||
Further, contiguous kernel virtual space may not be contiguous as
|
||||
physical memory. Since this API does not provide any scatter/gather
|
||||
capability, it will fail if the user tries to map a non-physically
|
||||
contiguous piece of memory. For this reason, memory to be mapped by
|
||||
this API should be obtained from sources which guarantee it to be
|
||||
physically contiguous (like kmalloc).
|
||||
|
||||
Note also that the above constraints on physical contiguity and
|
||||
dma_mask may not apply if the platform has an IOMMU (a device which
|
||||
maps an I/O DMA address to a physical memory address). However, to be
|
||||
portable, device driver writers may *not* assume that such an IOMMU
|
||||
exists.
|
||||
Further, the DMA address of the memory must be within the
|
||||
dma_mask of the device (the dma_mask is a bit mask of the
|
||||
addressable region for the device, i.e., if the DMA address of
|
||||
the memory ANDed with the dma_mask is still equal to the DMA
|
||||
address, then the device can perform DMA to the memory). To
|
||||
ensure that the memory allocated by kmalloc is within the dma_mask,
|
||||
the driver may specify various platform-dependent flags to restrict
|
||||
the DMA address range of the allocation (e.g., on x86, GFP_DMA
|
||||
guarantees to be within the first 16MB of available DMA addresses,
|
||||
as required by ISA devices).
|
||||
|
||||
Warnings: Memory coherency operates at a granularity called the cache
|
||||
line width. In order for memory mapped by this API to operate
|
||||
correctly, the mapped region must begin exactly on a cache line
|
||||
boundary and end exactly on one (to prevent two separately mapped
|
||||
regions from sharing a single cache line). Since the cache line size
|
||||
may not be known at compile time, the API will not enforce this
|
||||
requirement. Therefore, it is recommended that driver writers who
|
||||
don't take special care to determine the cache line size at run time
|
||||
only map virtual regions that begin and end on page boundaries (which
|
||||
are guaranteed also to be cache line boundaries).
|
||||
Note also that the above constraints on physical contiguity and
|
||||
dma_mask may not apply if the platform has an IOMMU (a device which
|
||||
maps an I/O DMA address to a physical memory address). However, to be
|
||||
portable, device driver writers may *not* assume that such an IOMMU
|
||||
exists.
|
||||
|
||||
DMA_TO_DEVICE synchronisation must be done after the last modification
|
||||
of the memory region by the software and before it is handed off to
|
||||
the device. Once this primitive is used, memory covered by this
|
||||
primitive should be treated as read-only by the device. If the device
|
||||
may write to it at any point, it should be DMA_BIDIRECTIONAL (see
|
||||
below).
|
||||
.. warning::
|
||||
|
||||
DMA_FROM_DEVICE synchronisation must be done before the driver
|
||||
accesses data that may be changed by the device. This memory should
|
||||
be treated as read-only by the driver. If the driver needs to write
|
||||
to it at any point, it should be DMA_BIDIRECTIONAL (see below).
|
||||
Memory coherency operates at a granularity called the cache
|
||||
line width. In order for memory mapped by this API to operate
|
||||
correctly, the mapped region must begin exactly on a cache line
|
||||
boundary and end exactly on one (to prevent two separately mapped
|
||||
regions from sharing a single cache line). Since the cache line size
|
||||
may not be known at compile time, the API will not enforce this
|
||||
requirement. Therefore, it is recommended that driver writers who
|
||||
don't take special care to determine the cache line size at run time
|
||||
only map virtual regions that begin and end on page boundaries (which
|
||||
are guaranteed also to be cache line boundaries).
|
||||
|
||||
DMA_BIDIRECTIONAL requires special handling: it means that the driver
|
||||
isn't sure if the memory was modified before being handed off to the
|
||||
device and also isn't sure if the device will also modify it. Thus,
|
||||
you must always sync bidirectional memory twice: once before the
|
||||
memory is handed off to the device (to make sure all memory changes
|
||||
are flushed from the processor) and once before the data may be
|
||||
accessed after being used by the device (to make sure any processor
|
||||
cache lines are updated with data that the device may have changed).
|
||||
DMA_TO_DEVICE synchronisation must be done after the last modification
|
||||
of the memory region by the software and before it is handed off to
|
||||
the device. Once this primitive is used, memory covered by this
|
||||
primitive should be treated as read-only by the device. If the device
|
||||
may write to it at any point, it should be DMA_BIDIRECTIONAL (see
|
||||
below).
|
||||
|
||||
void
|
||||
dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
DMA_FROM_DEVICE synchronisation must be done before the driver
|
||||
accesses data that may be changed by the device. This memory should
|
||||
be treated as read-only by the driver. If the driver needs to write
|
||||
to it at any point, it should be DMA_BIDIRECTIONAL (see below).
|
||||
|
||||
DMA_BIDIRECTIONAL requires special handling: it means that the driver
|
||||
isn't sure if the memory was modified before being handed off to the
|
||||
device and also isn't sure if the device will also modify it. Thus,
|
||||
you must always sync bidirectional memory twice: once before the
|
||||
memory is handed off to the device (to make sure all memory changes
|
||||
are flushed from the processor) and once before the data may be
|
||||
accessed after being used by the device (to make sure any processor
|
||||
cache lines are updated with data that the device may have changed).
|
||||
|
||||
::
|
||||
|
||||
void
|
||||
dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Unmaps the region previously mapped. All the parameters passed in
|
||||
must be identical to those passed in (and returned) by the mapping
|
||||
API.
|
||||
|
||||
dma_addr_t
|
||||
dma_map_page(struct device *dev, struct page *page,
|
||||
unsigned long offset, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
::
|
||||
|
||||
dma_addr_t
|
||||
dma_map_page(struct device *dev, struct page *page,
|
||||
unsigned long offset, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
void
|
||||
dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
API for mapping and unmapping for pages. All the notes and warnings
|
||||
for the other mapping APIs apply here. Also, although the <offset>
|
||||
@ -277,20 +316,24 @@ and <size> parameters are provided to do partial page mapping, it is
|
||||
recommended that you never use these unless you really know what the
|
||||
cache width is.
|
||||
|
||||
dma_addr_t
|
||||
dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
|
||||
enum dma_data_direction dir, unsigned long attrs)
|
||||
::
|
||||
|
||||
void
|
||||
dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
|
||||
enum dma_data_direction dir, unsigned long attrs)
|
||||
dma_addr_t
|
||||
dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
|
||||
enum dma_data_direction dir, unsigned long attrs)
|
||||
|
||||
void
|
||||
dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
|
||||
enum dma_data_direction dir, unsigned long attrs)
|
||||
|
||||
API for mapping and unmapping for MMIO resources. All the notes and
|
||||
warnings for the other mapping APIs apply here. The API should only be
|
||||
used to map device MMIO resources, mapping of RAM is not permitted.
|
||||
|
||||
int
|
||||
dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
|
||||
::
|
||||
|
||||
int
|
||||
dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
|
||||
|
||||
In some circumstances dma_map_single(), dma_map_page() and dma_map_resource()
|
||||
will fail to create a mapping. A driver can check for these errors by testing
|
||||
@ -298,9 +341,11 @@ the returned DMA address with dma_mapping_error(). A non-zero return value
|
||||
means the mapping could not be created and the driver should take appropriate
|
||||
action (e.g. reduce current DMA mapping usage or delay and try again later).
|
||||
|
||||
::
|
||||
|
||||
int
|
||||
dma_map_sg(struct device *dev, struct scatterlist *sg,
|
||||
int nents, enum dma_data_direction direction)
|
||||
int nents, enum dma_data_direction direction)
|
||||
|
||||
Returns: the number of DMA address segments mapped (this may be shorter
|
||||
than <nents> passed in if some elements of the scatter/gather list are
|
||||
@ -316,7 +361,7 @@ critical that the driver do something, in the case of a block driver
|
||||
aborting the request or even oopsing is better than doing nothing and
|
||||
corrupting the filesystem.
|
||||
|
||||
With scatterlists, you use the resulting mapping like this:
|
||||
With scatterlists, you use the resulting mapping like this::
|
||||
|
||||
int i, count = dma_map_sg(dev, sglist, nents, direction);
|
||||
struct scatterlist *sg;
|
||||
@ -337,9 +382,11 @@ Then you should loop count times (note: this can be less than nents times)
|
||||
and use sg_dma_address() and sg_dma_len() macros where you previously
|
||||
accessed sg->address and sg->length as shown above.
|
||||
|
||||
::
|
||||
|
||||
void
|
||||
dma_unmap_sg(struct device *dev, struct scatterlist *sg,
|
||||
int nents, enum dma_data_direction direction)
|
||||
int nents, enum dma_data_direction direction)
|
||||
|
||||
Unmap the previously mapped scatter/gather list. All the parameters
|
||||
must be the same as those passed in to the scatter/gather mapping
|
||||
@ -348,18 +395,27 @@ API.
|
||||
Note: <nents> must be the number you passed in, *not* the number of
|
||||
DMA address entries returned.
|
||||
|
||||
void
|
||||
dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nents,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nents,
|
||||
enum dma_data_direction direction)
|
||||
::
|
||||
|
||||
void
|
||||
dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
|
||||
size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
void
|
||||
dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
|
||||
size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
void
|
||||
dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
|
||||
int nents,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
void
|
||||
dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
|
||||
int nents,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Synchronise a single contiguous or scatter/gather mapping for the CPU
|
||||
and device. With the sync_sg API, all the parameters must be the same
|
||||
@ -367,36 +423,41 @@ as those passed into the single mapping API. With the sync_single API,
|
||||
you can use dma_handle and size parameters that aren't identical to
|
||||
those passed into the single mapping API to do a partial sync.
|
||||
|
||||
Notes: You must do this:
|
||||
|
||||
- Before reading values that have been written by DMA from the device
|
||||
(use the DMA_FROM_DEVICE direction)
|
||||
- After writing values that will be written to the device using DMA
|
||||
(use the DMA_TO_DEVICE direction)
|
||||
- before *and* after handing memory to the device if the memory is
|
||||
DMA_BIDIRECTIONAL
|
||||
.. note::
|
||||
|
||||
You must do this:
|
||||
|
||||
- Before reading values that have been written by DMA from the device
|
||||
(use the DMA_FROM_DEVICE direction)
|
||||
- After writing values that will be written to the device using DMA
|
||||
(use the DMA_TO_DEVICE direction)
|
||||
- before *and* after handing memory to the device if the memory is
|
||||
DMA_BIDIRECTIONAL
|
||||
|
||||
See also dma_map_single().
|
||||
|
||||
dma_addr_t
|
||||
dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size,
|
||||
enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
::
|
||||
|
||||
void
|
||||
dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr,
|
||||
size_t size, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
dma_addr_t
|
||||
dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size,
|
||||
enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
|
||||
int
|
||||
dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
|
||||
int nents, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
void
|
||||
dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr,
|
||||
size_t size, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
|
||||
void
|
||||
dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl,
|
||||
int nents, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
int
|
||||
dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
|
||||
int nents, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
|
||||
void
|
||||
dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl,
|
||||
int nents, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
|
||||
The four functions above are just like the counterpart functions
|
||||
without the _attrs suffixes, except that they pass an optional
|
||||
@ -410,37 +471,38 @@ is identical to those of the corresponding function
|
||||
without the _attrs suffix. As a result dma_map_single_attrs()
|
||||
can generally replace dma_map_single(), etc.
|
||||
|
||||
As an example of the use of the *_attrs functions, here's how
|
||||
As an example of the use of the ``*_attrs`` functions, here's how
|
||||
you could pass an attribute DMA_ATTR_FOO when mapping memory
|
||||
for DMA:
|
||||
for DMA::
|
||||
|
||||
#include <linux/dma-mapping.h>
|
||||
/* DMA_ATTR_FOO should be defined in linux/dma-mapping.h and
|
||||
* documented in Documentation/DMA-attributes.txt */
|
||||
...
|
||||
#include <linux/dma-mapping.h>
|
||||
/* DMA_ATTR_FOO should be defined in linux/dma-mapping.h and
|
||||
* documented in Documentation/DMA-attributes.txt */
|
||||
...
|
||||
|
||||
unsigned long attr;
|
||||
attr |= DMA_ATTR_FOO;
|
||||
....
|
||||
n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, attr);
|
||||
....
|
||||
unsigned long attr;
|
||||
attr |= DMA_ATTR_FOO;
|
||||
....
|
||||
n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, attr);
|
||||
....
|
||||
|
||||
Architectures that care about DMA_ATTR_FOO would check for its
|
||||
presence in their implementations of the mapping and unmapping
|
||||
routines, e.g.:
|
||||
routines, e.g.::
|
||||
|
||||
void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
|
||||
size_t size, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
{
|
||||
....
|
||||
if (attrs & DMA_ATTR_FOO)
|
||||
/* twizzle the frobnozzle */
|
||||
....
|
||||
void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
|
||||
size_t size, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
{
|
||||
....
|
||||
if (attrs & DMA_ATTR_FOO)
|
||||
/* twizzle the frobnozzle */
|
||||
....
|
||||
}
|
||||
|
||||
|
||||
Part II - Advanced dma_ usage
|
||||
-----------------------------
|
||||
Part II - Advanced dma usage
|
||||
----------------------------
|
||||
|
||||
Warning: These pieces of the DMA API should not be used in the
|
||||
majority of cases, since they cater for unlikely corner cases that
|
||||
@ -450,9 +512,11 @@ If you don't understand how cache line coherency works between a
|
||||
processor and an I/O device, you should not be using this part of the
|
||||
API at all.
|
||||
|
||||
void *
|
||||
dma_alloc_noncoherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
::
|
||||
|
||||
void *
|
||||
dma_alloc_noncoherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
|
||||
Identical to dma_alloc_coherent() except that the platform will
|
||||
choose to return either consistent or non-consistent memory as it sees
|
||||
@ -468,39 +532,49 @@ only use this API if you positively know your driver will be
|
||||
required to work on one of the rare (usually non-PCI) architectures
|
||||
that simply cannot make consistent memory.
|
||||
|
||||
void
|
||||
dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
::
|
||||
|
||||
void
|
||||
dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
|
||||
Free memory allocated by the nonconsistent API. All parameters must
|
||||
be identical to those passed in (and returned by
|
||||
dma_alloc_noncoherent()).
|
||||
|
||||
int
|
||||
dma_get_cache_alignment(void)
|
||||
::
|
||||
|
||||
int
|
||||
dma_get_cache_alignment(void)
|
||||
|
||||
Returns the processor cache alignment. This is the absolute minimum
|
||||
alignment *and* width that you must observe when either mapping
|
||||
memory or doing partial flushes.
|
||||
|
||||
Notes: This API may return a number *larger* than the actual cache
|
||||
line, but it will guarantee that one or more cache lines fit exactly
|
||||
into the width returned by this call. It will also always be a power
|
||||
of two for easy alignment.
|
||||
.. note::
|
||||
|
||||
void
|
||||
dma_cache_sync(struct device *dev, void *vaddr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
This API may return a number *larger* than the actual cache
|
||||
line, but it will guarantee that one or more cache lines fit exactly
|
||||
into the width returned by this call. It will also always be a power
|
||||
of two for easy alignment.
|
||||
|
||||
::
|
||||
|
||||
void
|
||||
dma_cache_sync(struct device *dev, void *vaddr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Do a partial sync of memory that was allocated by
|
||||
dma_alloc_noncoherent(), starting at virtual address vaddr and
|
||||
continuing on for size. Again, you *must* observe the cache line
|
||||
boundaries when doing this.
|
||||
|
||||
int
|
||||
dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
|
||||
dma_addr_t device_addr, size_t size, int
|
||||
flags)
|
||||
::
|
||||
|
||||
int
|
||||
dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
|
||||
dma_addr_t device_addr, size_t size, int
|
||||
flags)
|
||||
|
||||
Declare region of memory to be handed out by dma_alloc_coherent() when
|
||||
it's asked for coherent memory for this device.
|
||||
@ -516,21 +590,21 @@ size is the size of the area (must be multiples of PAGE_SIZE).
|
||||
|
||||
flags can be ORed together and are:
|
||||
|
||||
DMA_MEMORY_MAP - request that the memory returned from
|
||||
dma_alloc_coherent() be directly writable.
|
||||
- DMA_MEMORY_MAP - request that the memory returned from
|
||||
dma_alloc_coherent() be directly writable.
|
||||
|
||||
DMA_MEMORY_IO - request that the memory returned from
|
||||
dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc.
|
||||
- DMA_MEMORY_IO - request that the memory returned from
|
||||
dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc.
|
||||
|
||||
One or both of these flags must be present.
|
||||
|
||||
DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
|
||||
dma_alloc_coherent of any child devices of this one (for memory residing
|
||||
on a bridge).
|
||||
- DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
|
||||
dma_alloc_coherent of any child devices of this one (for memory residing
|
||||
on a bridge).
|
||||
|
||||
DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
|
||||
Do not allow dma_alloc_coherent() to fall back to system memory when
|
||||
it's out of memory in the declared region.
|
||||
- DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
|
||||
Do not allow dma_alloc_coherent() to fall back to system memory when
|
||||
it's out of memory in the declared region.
|
||||
|
||||
The return value will be either DMA_MEMORY_MAP or DMA_MEMORY_IO and
|
||||
must correspond to a passed in flag (i.e. no returning DMA_MEMORY_IO
|
||||
@ -543,15 +617,17 @@ must be accessed using the correct bus functions. If your driver
|
||||
isn't prepared to handle this contingency, it should not specify
|
||||
DMA_MEMORY_IO in the input flags.
|
||||
|
||||
As a simplification for the platforms, only *one* such region of
|
||||
As a simplification for the platforms, only **one** such region of
|
||||
memory may be declared per device.
|
||||
|
||||
For reasons of efficiency, most platforms choose to track the declared
|
||||
region only at the granularity of a page. For smaller allocations,
|
||||
you should use the dma_pool() API.
|
||||
|
||||
void
|
||||
dma_release_declared_memory(struct device *dev)
|
||||
::
|
||||
|
||||
void
|
||||
dma_release_declared_memory(struct device *dev)
|
||||
|
||||
Remove the memory region previously declared from the system. This
|
||||
API performs *no* in-use checking for this region and will return
|
||||
@ -559,9 +635,11 @@ unconditionally having removed all the required structures. It is the
|
||||
driver's job to ensure that no parts of this memory region are
|
||||
currently in use.
|
||||
|
||||
void *
|
||||
dma_mark_declared_memory_occupied(struct device *dev,
|
||||
dma_addr_t device_addr, size_t size)
|
||||
::
|
||||
|
||||
void *
|
||||
dma_mark_declared_memory_occupied(struct device *dev,
|
||||
dma_addr_t device_addr, size_t size)
|
||||
|
||||
This is used to occupy specific regions of the declared space
|
||||
(dma_alloc_coherent() will hand out the first free region it finds).
|
||||
@ -592,38 +670,37 @@ option has a performance impact. Do not enable it in production kernels.
|
||||
If you boot the resulting kernel, it will contain code which does some bookkeeping
|
||||
about what DMA memory was allocated for which device. If this code detects an
|
||||
error it prints a warning message with some details into your kernel log. An
|
||||
example warning message may look like this:
|
||||
example warning message may look like this::
|
||||
|
||||
------------[ cut here ]------------
|
||||
WARNING: at /data2/repos/linux-2.6-iommu/lib/dma-debug.c:448
|
||||
check_unmap+0x203/0x490()
|
||||
Hardware name:
|
||||
forcedeth 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong
|
||||
function [device address=0x00000000640444be] [size=66 bytes] [mapped as
|
||||
single] [unmapped as page]
|
||||
Modules linked in: nfsd exportfs bridge stp llc r8169
|
||||
Pid: 0, comm: swapper Tainted: G W 2.6.28-dmatest-09289-g8bb99c0 #1
|
||||
Call Trace:
|
||||
<IRQ> [<ffffffff80240b22>] warn_slowpath+0xf2/0x130
|
||||
[<ffffffff80647b70>] _spin_unlock+0x10/0x30
|
||||
[<ffffffff80537e75>] usb_hcd_link_urb_to_ep+0x75/0xc0
|
||||
[<ffffffff80647c22>] _spin_unlock_irqrestore+0x12/0x40
|
||||
[<ffffffff8055347f>] ohci_urb_enqueue+0x19f/0x7c0
|
||||
[<ffffffff80252f96>] queue_work+0x56/0x60
|
||||
[<ffffffff80237e10>] enqueue_task_fair+0x20/0x50
|
||||
[<ffffffff80539279>] usb_hcd_submit_urb+0x379/0xbc0
|
||||
[<ffffffff803b78c3>] cpumask_next_and+0x23/0x40
|
||||
[<ffffffff80235177>] find_busiest_group+0x207/0x8a0
|
||||
[<ffffffff8064784f>] _spin_lock_irqsave+0x1f/0x50
|
||||
[<ffffffff803c7ea3>] check_unmap+0x203/0x490
|
||||
[<ffffffff803c8259>] debug_dma_unmap_page+0x49/0x50
|
||||
[<ffffffff80485f26>] nv_tx_done_optimized+0xc6/0x2c0
|
||||
[<ffffffff80486c13>] nv_nic_irq_optimized+0x73/0x2b0
|
||||
[<ffffffff8026df84>] handle_IRQ_event+0x34/0x70
|
||||
[<ffffffff8026ffe9>] handle_edge_irq+0xc9/0x150
|
||||
[<ffffffff8020e3ab>] do_IRQ+0xcb/0x1c0
|
||||
[<ffffffff8020c093>] ret_from_intr+0x0/0xa
|
||||
<EOI> <4>---[ end trace f6435a98e2a38c0e ]---
|
||||
WARNING: at /data2/repos/linux-2.6-iommu/lib/dma-debug.c:448
|
||||
check_unmap+0x203/0x490()
|
||||
Hardware name:
|
||||
forcedeth 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong
|
||||
function [device address=0x00000000640444be] [size=66 bytes] [mapped as
|
||||
single] [unmapped as page]
|
||||
Modules linked in: nfsd exportfs bridge stp llc r8169
|
||||
Pid: 0, comm: swapper Tainted: G W 2.6.28-dmatest-09289-g8bb99c0 #1
|
||||
Call Trace:
|
||||
<IRQ> [<ffffffff80240b22>] warn_slowpath+0xf2/0x130
|
||||
[<ffffffff80647b70>] _spin_unlock+0x10/0x30
|
||||
[<ffffffff80537e75>] usb_hcd_link_urb_to_ep+0x75/0xc0
|
||||
[<ffffffff80647c22>] _spin_unlock_irqrestore+0x12/0x40
|
||||
[<ffffffff8055347f>] ohci_urb_enqueue+0x19f/0x7c0
|
||||
[<ffffffff80252f96>] queue_work+0x56/0x60
|
||||
[<ffffffff80237e10>] enqueue_task_fair+0x20/0x50
|
||||
[<ffffffff80539279>] usb_hcd_submit_urb+0x379/0xbc0
|
||||
[<ffffffff803b78c3>] cpumask_next_and+0x23/0x40
|
||||
[<ffffffff80235177>] find_busiest_group+0x207/0x8a0
|
||||
[<ffffffff8064784f>] _spin_lock_irqsave+0x1f/0x50
|
||||
[<ffffffff803c7ea3>] check_unmap+0x203/0x490
|
||||
[<ffffffff803c8259>] debug_dma_unmap_page+0x49/0x50
|
||||
[<ffffffff80485f26>] nv_tx_done_optimized+0xc6/0x2c0
|
||||
[<ffffffff80486c13>] nv_nic_irq_optimized+0x73/0x2b0
|
||||
[<ffffffff8026df84>] handle_IRQ_event+0x34/0x70
|
||||
[<ffffffff8026ffe9>] handle_edge_irq+0xc9/0x150
|
||||
[<ffffffff8020e3ab>] do_IRQ+0xcb/0x1c0
|
||||
[<ffffffff8020c093>] ret_from_intr+0x0/0xa
|
||||
<EOI> <4>---[ end trace f6435a98e2a38c0e ]---
|
||||
|
||||
The driver developer can find the driver and the device including a stacktrace
|
||||
of the DMA-API call which caused this warning.
|
||||
@ -637,43 +714,42 @@ details.
|
||||
The debugfs directory for the DMA-API debugging code is called dma-api/. In
|
||||
this directory the following files can currently be found:
|
||||
|
||||
dma-api/all_errors This file contains a numeric value. If this
|
||||
=============================== ===============================================
|
||||
dma-api/all_errors This file contains a numeric value. If this
|
||||
value is not equal to zero the debugging code
|
||||
will print a warning for every error it finds
|
||||
into the kernel log. Be careful with this
|
||||
option, as it can easily flood your logs.
|
||||
|
||||
dma-api/disabled This read-only file contains the character 'Y'
|
||||
dma-api/disabled This read-only file contains the character 'Y'
|
||||
if the debugging code is disabled. This can
|
||||
happen when it runs out of memory or if it was
|
||||
disabled at boot time
|
||||
|
||||
dma-api/error_count This file is read-only and shows the total
|
||||
dma-api/error_count This file is read-only and shows the total
|
||||
number of errors found.
|
||||
|
||||
dma-api/num_errors The number in this file shows how many
|
||||
dma-api/num_errors The number in this file shows how many
|
||||
warnings will be printed to the kernel log
|
||||
before it stops. This number is initialized to
|
||||
one at system boot and can be set by writing into
|
||||
this file
|
||||
|
||||
dma-api/min_free_entries
|
||||
This read-only file can be read to get the
|
||||
dma-api/min_free_entries This read-only file can be read to get the
|
||||
minimum number of free dma_debug_entries the
|
||||
allocator has ever seen. If this value goes
|
||||
down to zero the code will disable itself
|
||||
because it is no longer reliable.
|
||||
|
||||
dma-api/num_free_entries
|
||||
The current number of free dma_debug_entries
|
||||
dma-api/num_free_entries The current number of free dma_debug_entries
|
||||
in the allocator.
|
||||
|
||||
dma-api/driver-filter
|
||||
You can write a name of a driver into this file
|
||||
dma-api/driver-filter You can write a name of a driver into this file
|
||||
to limit the debug output to requests from that
|
||||
particular driver. Write an empty string to
|
||||
that file to disable the filter and see
|
||||
all errors again.
|
||||
=============================== ===============================================
|
||||
|
||||
If you have this code compiled into your kernel it will be enabled by default.
|
||||
If you want to boot without the bookkeeping anyway you can provide
|
||||
@ -692,7 +768,10 @@ of preallocated entries is defined per architecture. If it is too low for you
|
||||
boot with 'dma_debug_entries=<your_desired_number>' to overwrite the
|
||||
architectural default.
|
||||
|
||||
void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
|
||||
::
|
||||
|
||||
void
|
||||
debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
|
||||
|
||||
dma-debug interface debug_dma_mapping_error() to debug drivers that fail
|
||||
to check DMA mapping errors on addresses returned by dma_map_single() and
|
||||
@ -702,4 +781,3 @@ the driver. When driver does unmap, debug_dma_unmap() checks the flag and if
|
||||
this flag is still set, prints warning message that includes call trace that
|
||||
leads up to the unmap. This interface can be called from dma_mapping_error()
|
||||
routines to enable DMA mapping error check debugging.
|
||||
|
||||
|
@ -1,19 +1,20 @@
|
||||
DMA with ISA and LPC devices
|
||||
============================
|
||||
============================
|
||||
DMA with ISA and LPC devices
|
||||
============================
|
||||
|
||||
Pierre Ossman <drzeus@drzeus.cx>
|
||||
:Author: Pierre Ossman <drzeus@drzeus.cx>
|
||||
|
||||
This document describes how to do DMA transfers using the old ISA DMA
|
||||
controller. Even though ISA is more or less dead today the LPC bus
|
||||
uses the same DMA system so it will be around for quite some time.
|
||||
|
||||
Part I - Headers and dependencies
|
||||
---------------------------------
|
||||
Headers and dependencies
|
||||
------------------------
|
||||
|
||||
To do ISA style DMA you need to include two headers:
|
||||
To do ISA style DMA you need to include two headers::
|
||||
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <asm/dma.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <asm/dma.h>
|
||||
|
||||
The first is the generic DMA API used to convert virtual addresses to
|
||||
bus addresses (see Documentation/DMA-API.txt for details).
|
||||
@ -23,8 +24,8 @@ this is not present on all platforms make sure you construct your
|
||||
Kconfig to be dependent on ISA_DMA_API (not ISA) so that nobody tries
|
||||
to build your driver on unsupported platforms.
|
||||
|
||||
Part II - Buffer allocation
|
||||
---------------------------
|
||||
Buffer allocation
|
||||
-----------------
|
||||
|
||||
The ISA DMA controller has some very strict requirements on which
|
||||
memory it can access so extra care must be taken when allocating
|
||||
@ -42,13 +43,13 @@ requirements you pass the flag GFP_DMA to kmalloc.
|
||||
|
||||
Unfortunately the memory available for ISA DMA is scarce so unless you
|
||||
allocate the memory during boot-up it's a good idea to also pass
|
||||
__GFP_REPEAT and __GFP_NOWARN to make the allocator try a bit harder.
|
||||
__GFP_RETRY_MAYFAIL and __GFP_NOWARN to make the allocator try a bit harder.
|
||||
|
||||
(This scarcity also means that you should allocate the buffer as
|
||||
early as possible and not release it until the driver is unloaded.)
|
||||
|
||||
Part III - Address translation
|
||||
------------------------------
|
||||
Address translation
|
||||
-------------------
|
||||
|
||||
To translate the virtual address to a bus address, use the normal DMA
|
||||
API. Do _not_ use isa_virt_to_phys() even though it does the same
|
||||
@ -61,8 +62,8 @@ Note: x86_64 had a broken DMA API when it came to ISA but has since
|
||||
been fixed. If your arch has problems then fix the DMA API instead of
|
||||
reverting to the ISA functions.
|
||||
|
||||
Part IV - Channels
|
||||
------------------
|
||||
Channels
|
||||
--------
|
||||
|
||||
A normal ISA DMA controller has 8 channels. The lower four are for
|
||||
8-bit transfers and the upper four are for 16-bit transfers.
|
||||
@ -80,8 +81,8 @@ The ability to use 16-bit or 8-bit transfers is _not_ up to you as a
|
||||
driver author but depends on what the hardware supports. Check your
|
||||
specs or test different channels.
|
||||
|
||||
Part V - Transfer data
|
||||
----------------------
|
||||
Transfer data
|
||||
-------------
|
||||
|
||||
Now for the good stuff, the actual DMA transfer. :)
|
||||
|
||||
@ -112,37 +113,37 @@ Once the DMA transfer is finished (or timed out) you should disable
|
||||
the channel again. You should also check get_dma_residue() to make
|
||||
sure that all data has been transferred.
|
||||
|
||||
Example:
|
||||
Example::
|
||||
|
||||
int flags, residue;
|
||||
int flags, residue;
|
||||
|
||||
flags = claim_dma_lock();
|
||||
flags = claim_dma_lock();
|
||||
|
||||
clear_dma_ff();
|
||||
clear_dma_ff();
|
||||
|
||||
set_dma_mode(channel, DMA_MODE_WRITE);
|
||||
set_dma_addr(channel, phys_addr);
|
||||
set_dma_count(channel, num_bytes);
|
||||
set_dma_mode(channel, DMA_MODE_WRITE);
|
||||
set_dma_addr(channel, phys_addr);
|
||||
set_dma_count(channel, num_bytes);
|
||||
|
||||
dma_enable(channel);
|
||||
dma_enable(channel);
|
||||
|
||||
release_dma_lock(flags);
|
||||
release_dma_lock(flags);
|
||||
|
||||
while (!device_done());
|
||||
while (!device_done());
|
||||
|
||||
flags = claim_dma_lock();
|
||||
flags = claim_dma_lock();
|
||||
|
||||
dma_disable(channel);
|
||||
dma_disable(channel);
|
||||
|
||||
residue = get_dma_residue(channel);
|
||||
if (residue != 0)
|
||||
printk(KERN_ERR "driver: Incomplete DMA transfer!"
|
||||
" %d bytes left!\n", residue);
|
||||
residue = get_dma_residue(channel);
|
||||
if (residue != 0)
|
||||
printk(KERN_ERR "driver: Incomplete DMA transfer!"
|
||||
" %d bytes left!\n", residue);
|
||||
|
||||
release_dma_lock(flags);
|
||||
release_dma_lock(flags);
|
||||
|
||||
Part VI - Suspend/resume
|
||||
------------------------
|
||||
Suspend/resume
|
||||
--------------
|
||||
|
||||
It is the driver's responsibility to make sure that the machine isn't
|
||||
suspended while a DMA transfer is in progress. Also, all DMA settings
|
||||
|
@ -1,5 +1,6 @@
|
||||
DMA attributes
|
||||
==============
|
||||
==============
|
||||
DMA attributes
|
||||
==============
|
||||
|
||||
This document describes the semantics of the DMA attributes that are
|
||||
defined in linux/dma-mapping.h.
|
||||
@ -108,6 +109,7 @@ This is a hint to the DMA-mapping subsystem that it's probably not worth
|
||||
the time to try to allocate memory in a way that gives better TLB
|
||||
efficiency (AKA it's not worth trying to build the mapping out of larger
|
||||
pages). You might want to specify this if:
|
||||
|
||||
- You know that the accesses to this memory won't thrash the TLB.
|
||||
You might know that the accesses are likely to be sequential or
|
||||
that they aren't sequential but it's unlikely you'll ping-pong
|
||||
@ -121,11 +123,12 @@ pages). You might want to specify this if:
|
||||
the mapping to have a short lifetime then it may be worth it to
|
||||
optimize allocation (avoid coming up with large pages) instead of
|
||||
getting the slight performance win of larger pages.
|
||||
|
||||
Setting this hint doesn't guarantee that you won't get huge pages, but it
|
||||
means that we won't try quite as hard to get them.
|
||||
|
||||
NOTE: At the moment DMA_ATTR_ALLOC_SINGLE_PAGES is only implemented on ARM,
|
||||
though ARM64 patches will likely be posted soon.
|
||||
.. note:: At the moment DMA_ATTR_ALLOC_SINGLE_PAGES is only implemented on ARM,
|
||||
though ARM64 patches will likely be posted soon.
|
||||
|
||||
DMA_ATTR_NO_WARN
|
||||
----------------
|
||||
@ -142,10 +145,10 @@ problem at all, depending on the implementation of the retry mechanism.
|
||||
So, this provides a way for drivers to avoid those error messages on calls
|
||||
where allocation failures are not a problem, and shouldn't bother the logs.
|
||||
|
||||
NOTE: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC.
|
||||
.. note:: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC.
|
||||
|
||||
DMA_ATTR_PRIVILEGED
|
||||
------------------------------
|
||||
-------------------
|
||||
|
||||
Some advanced peripherals such as remote processors and GPUs perform
|
||||
accesses to DMA buffers in both privileged "supervisor" and unprivileged
|
||||
|
@ -1,9 +1,8 @@
|
||||
=====================
|
||||
The Linux IPMI Driver
|
||||
=====================
|
||||
|
||||
The Linux IPMI Driver
|
||||
---------------------
|
||||
Corey Minyard
|
||||
<minyard@mvista.com>
|
||||
<minyard@acm.org>
|
||||
:Author: Corey Minyard <minyard@mvista.com> / <minyard@acm.org>
|
||||
|
||||
The Intelligent Platform Management Interface, or IPMI, is a
|
||||
standard for controlling intelligent devices that monitor a system.
|
||||
@ -141,7 +140,7 @@ Addressing
|
||||
----------
|
||||
|
||||
The IPMI addressing works much like IP addresses, you have an overlay
|
||||
to handle the different address types. The overlay is:
|
||||
to handle the different address types. The overlay is::
|
||||
|
||||
struct ipmi_addr
|
||||
{
|
||||
@ -153,7 +152,7 @@ to handle the different address types. The overlay is:
|
||||
The addr_type determines what the address really is. The driver
|
||||
currently understands two different types of addresses.
|
||||
|
||||
"System Interface" addresses are defined as:
|
||||
"System Interface" addresses are defined as::
|
||||
|
||||
struct ipmi_system_interface_addr
|
||||
{
|
||||
@ -166,7 +165,7 @@ straight to the BMC on the current card. The channel must be
|
||||
IPMI_BMC_CHANNEL.
|
||||
|
||||
Messages that are destined to go out on the IPMB bus use the
|
||||
IPMI_IPMB_ADDR_TYPE address type. The format is
|
||||
IPMI_IPMB_ADDR_TYPE address type. The format is::
|
||||
|
||||
struct ipmi_ipmb_addr
|
||||
{
|
||||
@ -184,16 +183,16 @@ spec.
|
||||
Messages
|
||||
--------
|
||||
|
||||
Messages are defined as:
|
||||
Messages are defined as::
|
||||
|
||||
struct ipmi_msg
|
||||
{
|
||||
struct ipmi_msg
|
||||
{
|
||||
unsigned char netfn;
|
||||
unsigned char lun;
|
||||
unsigned char cmd;
|
||||
unsigned char *data;
|
||||
int data_len;
|
||||
};
|
||||
};
|
||||
|
||||
The driver takes care of adding/stripping the header information. The
|
||||
data portion is just the data to be sent (do NOT put addressing info
|
||||
@ -208,7 +207,7 @@ block of data, even when receiving messages. Otherwise the driver
|
||||
will have no place to put the message.
|
||||
|
||||
Messages coming up from the message handler in kernelland will come in
|
||||
as:
|
||||
as::
|
||||
|
||||
struct ipmi_recv_msg
|
||||
{
|
||||
@ -246,6 +245,7 @@ and the user should not have to care what type of SMI is below them.
|
||||
|
||||
|
||||
Watching For Interfaces
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
When your code comes up, the IPMI driver may or may not have detected
|
||||
if IPMI devices exist. So you might have to defer your setup until
|
||||
@ -256,6 +256,7 @@ and tell you when they come and go.
|
||||
|
||||
|
||||
Creating the User
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
To use the message handler, you must first create a user using
|
||||
ipmi_create_user. The interface number specifies which SMI you want
|
||||
@ -272,6 +273,7 @@ closing the device automatically destroys the user.
|
||||
|
||||
|
||||
Messaging
|
||||
^^^^^^^^^
|
||||
|
||||
To send a message from kernel-land, the ipmi_request_settime() call does
|
||||
pretty much all message handling. Most of the parameters are
|
||||
@ -321,6 +323,7 @@ though, since it is tricky to manage your own buffers.
|
||||
|
||||
|
||||
Events and Incoming Commands
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The driver takes care of polling for IPMI events and receiving
|
||||
commands (commands are messages that are not responses, they are
|
||||
@ -367,7 +370,7 @@ in the system. It discovers interfaces through a host of different
|
||||
methods, depending on the system.
|
||||
|
||||
You can specify up to four interfaces on the module load line and
|
||||
control some module parameters:
|
||||
control some module parameters::
|
||||
|
||||
modprobe ipmi_si.o type=<type1>,<type2>....
|
||||
ports=<port1>,<port2>... addrs=<addr1>,<addr2>...
|
||||
@ -437,7 +440,7 @@ default is one. Setting to 0 is useful with the hotmod, but is
|
||||
obviously only useful for modules.
|
||||
|
||||
When compiled into the kernel, the parameters can be specified on the
|
||||
kernel command line as:
|
||||
kernel command line as::
|
||||
|
||||
ipmi_si.type=<type1>,<type2>...
|
||||
ipmi_si.ports=<port1>,<port2>... ipmi_si.addrs=<addr1>,<addr2>...
|
||||
@ -474,16 +477,22 @@ The driver supports a hot add and remove of interfaces. This way,
|
||||
interfaces can be added or removed after the kernel is up and running.
|
||||
This is done using /sys/module/ipmi_si/parameters/hotmod, which is a
|
||||
write-only parameter. You write a string to this interface. The string
|
||||
has the format:
|
||||
has the format::
|
||||
|
||||
<op1>[:op2[:op3...]]
|
||||
The "op"s are:
|
||||
|
||||
The "op"s are::
|
||||
|
||||
add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]]
|
||||
You can specify more than one interface on the line. The "opt"s are:
|
||||
|
||||
You can specify more than one interface on the line. The "opt"s are::
|
||||
|
||||
rsp=<regspacing>
|
||||
rsi=<regsize>
|
||||
rsh=<regshift>
|
||||
irq=<irq>
|
||||
ipmb=<ipmb slave addr>
|
||||
|
||||
and these have the same meanings as discussed above. Note that you
|
||||
can also use this on the kernel command line for a more compact format
|
||||
for specifying an interface. Note that when removing an interface,
|
||||
@ -496,7 +505,7 @@ The SMBus Driver (SSIF)
|
||||
The SMBus driver allows up to 4 SMBus devices to be configured in the
|
||||
system. By default, the driver will only register with something it
|
||||
finds in DMI or ACPI tables. You can change this
|
||||
at module load time (for a module) with:
|
||||
at module load time (for a module) with::
|
||||
|
||||
modprobe ipmi_ssif.o
|
||||
addr=<i2caddr1>[,<i2caddr2>[,...]]
|
||||
@ -535,7 +544,7 @@ the smb_addr parameter unless you have DMI or ACPI data to tell the
|
||||
driver what to use.
|
||||
|
||||
When compiled into the kernel, the addresses can be specified on the
|
||||
kernel command line as:
|
||||
kernel command line as::
|
||||
|
||||
ipmi_ssif.addr=<i2caddr1>[,<i2caddr2>[...]]
|
||||
ipmi_ssif.adapter=<adapter1>[,<adapter2>[...]]
|
||||
@ -565,9 +574,9 @@ Some users need more detailed information about a device, like where
|
||||
the address came from or the raw base device for the IPMI interface.
|
||||
You can use the IPMI smi_watcher to catch the IPMI interfaces as they
|
||||
come or go, and to grab the information, you can use the function
|
||||
ipmi_get_smi_info(), which returns the following structure:
|
||||
ipmi_get_smi_info(), which returns the following structure::
|
||||
|
||||
struct ipmi_smi_info {
|
||||
struct ipmi_smi_info {
|
||||
enum ipmi_addr_src addr_src;
|
||||
struct device *dev;
|
||||
union {
|
||||
@ -575,7 +584,7 @@ struct ipmi_smi_info {
|
||||
void *acpi_handle;
|
||||
} acpi_info;
|
||||
} addr_info;
|
||||
};
|
||||
};
|
||||
|
||||
Currently special info for only SI_ACPI address sources is
|
||||
returned. Others may be added as necessary.
|
||||
@ -590,7 +599,7 @@ Watchdog
|
||||
|
||||
A watchdog timer is provided that implements the Linux-standard
|
||||
watchdog timer interface. It has three module parameters that can be
|
||||
used to control it:
|
||||
used to control it::
|
||||
|
||||
modprobe ipmi_watchdog timeout=<t> pretimeout=<t> action=<action type>
|
||||
preaction=<preaction type> preop=<preop type> start_now=x
|
||||
@ -635,7 +644,7 @@ watchdog device is closed. The default value of nowayout is true
|
||||
if the CONFIG_WATCHDOG_NOWAYOUT option is enabled, or false if not.
|
||||
|
||||
When compiled into the kernel, the kernel command line is available
|
||||
for configuring the watchdog:
|
||||
for configuring the watchdog::
|
||||
|
||||
ipmi_watchdog.timeout=<t> ipmi_watchdog.pretimeout=<t>
|
||||
ipmi_watchdog.action=<action type>
|
||||
@ -675,6 +684,7 @@ also get a bunch of OEM events holding the panic string.
|
||||
|
||||
|
||||
The field settings of the events are:
|
||||
|
||||
* Generator ID: 0x21 (kernel)
|
||||
* EvM Rev: 0x03 (this event is formatted in IPMI 1.0 format)
|
||||
* Sensor Type: 0x20 (OS critical stop sensor)
|
||||
@ -683,18 +693,20 @@ The field settings of the events are:
|
||||
* Event Data 1: 0xa1 (Runtime stop in OEM bytes 2 and 3)
|
||||
* Event data 2: second byte of panic string
|
||||
* Event data 3: third byte of panic string
|
||||
|
||||
See the IPMI spec for the details of the event layout. This event is
|
||||
always sent to the local management controller. It will handle routing
|
||||
the message to the right place.
|
||||
|
||||
Other OEM events have the following format:
|
||||
Record ID (bytes 0-1): Set by the SEL.
|
||||
Record type (byte 2): 0xf0 (OEM non-timestamped)
|
||||
byte 3: The slave address of the card saving the panic
|
||||
byte 4: A sequence number (starting at zero)
|
||||
The rest of the bytes (11 bytes) are the panic string. If the panic string
|
||||
is longer than 11 bytes, multiple messages will be sent with increasing
|
||||
sequence numbers.
|
||||
|
||||
* Record ID (bytes 0-1): Set by the SEL.
|
||||
* Record type (byte 2): 0xf0 (OEM non-timestamped)
|
||||
* byte 3: The slave address of the card saving the panic
|
||||
* byte 4: A sequence number (starting at zero)
|
||||
The rest of the bytes (11 bytes) are the panic string. If the panic string
|
||||
is longer than 11 bytes, multiple messages will be sent with increasing
|
||||
sequence numbers.
|
||||
|
||||
Because you cannot send OEM events using the standard interface, this
|
||||
function will attempt to find an SEL and add the events there. It
|
||||
|
@ -1,8 +1,11 @@
|
||||
ChangeLog:
|
||||
Started by Ingo Molnar <mingo@redhat.com>
|
||||
Update by Max Krasnyansky <maxk@qualcomm.com>
|
||||
|
||||
================
|
||||
SMP IRQ affinity
|
||||
================
|
||||
|
||||
ChangeLog:
|
||||
- Started by Ingo Molnar <mingo@redhat.com>
|
||||
- Update by Max Krasnyansky <maxk@qualcomm.com>
|
||||
|
||||
|
||||
/proc/irq/IRQ#/smp_affinity and /proc/irq/IRQ#/smp_affinity_list specify
|
||||
which target CPUs are permitted for a given IRQ source. It's a bitmask
|
||||
@ -16,50 +19,52 @@ will be set to the default mask. It can then be changed as described above.
|
||||
Default mask is 0xffffffff.
|
||||
|
||||
Here is an example of restricting IRQ44 (eth1) to CPU0-3 then restricting
|
||||
it to CPU4-7 (this is an 8-CPU SMP box):
|
||||
it to CPU4-7 (this is an 8-CPU SMP box)::
|
||||
|
||||
[root@moon 44]# cd /proc/irq/44
|
||||
[root@moon 44]# cat smp_affinity
|
||||
ffffffff
|
||||
[root@moon 44]# cd /proc/irq/44
|
||||
[root@moon 44]# cat smp_affinity
|
||||
ffffffff
|
||||
|
||||
[root@moon 44]# echo 0f > smp_affinity
|
||||
[root@moon 44]# cat smp_affinity
|
||||
0000000f
|
||||
[root@moon 44]# ping -f h
|
||||
PING hell (195.4.7.3): 56 data bytes
|
||||
...
|
||||
--- hell ping statistics ---
|
||||
6029 packets transmitted, 6027 packets received, 0% packet loss
|
||||
round-trip min/avg/max = 0.1/0.1/0.4 ms
|
||||
[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
|
||||
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
|
||||
44: 1068 1785 1785 1783 0 0 0 0 IO-APIC-level eth1
|
||||
[root@moon 44]# echo 0f > smp_affinity
|
||||
[root@moon 44]# cat smp_affinity
|
||||
0000000f
|
||||
[root@moon 44]# ping -f h
|
||||
PING hell (195.4.7.3): 56 data bytes
|
||||
...
|
||||
--- hell ping statistics ---
|
||||
6029 packets transmitted, 6027 packets received, 0% packet loss
|
||||
round-trip min/avg/max = 0.1/0.1/0.4 ms
|
||||
[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
|
||||
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
|
||||
44: 1068 1785 1785 1783 0 0 0 0 IO-APIC-level eth1
|
||||
|
||||
As can be seen from the line above IRQ44 was delivered only to the first four
|
||||
processors (0-3).
|
||||
Now let's restrict that IRQ to CPU(4-7).
|
||||
|
||||
[root@moon 44]# echo f0 > smp_affinity
|
||||
[root@moon 44]# cat smp_affinity
|
||||
000000f0
|
||||
[root@moon 44]# ping -f h
|
||||
PING hell (195.4.7.3): 56 data bytes
|
||||
..
|
||||
--- hell ping statistics ---
|
||||
2779 packets transmitted, 2777 packets received, 0% packet loss
|
||||
round-trip min/avg/max = 0.1/0.5/585.4 ms
|
||||
[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
|
||||
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
|
||||
44: 1068 1785 1785 1783 1784 1069 1070 1069 IO-APIC-level eth1
|
||||
::
|
||||
|
||||
[root@moon 44]# echo f0 > smp_affinity
|
||||
[root@moon 44]# cat smp_affinity
|
||||
000000f0
|
||||
[root@moon 44]# ping -f h
|
||||
PING hell (195.4.7.3): 56 data bytes
|
||||
..
|
||||
--- hell ping statistics ---
|
||||
2779 packets transmitted, 2777 packets received, 0% packet loss
|
||||
round-trip min/avg/max = 0.1/0.5/585.4 ms
|
||||
[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
|
||||
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
|
||||
44: 1068 1785 1785 1783 1784 1069 1070 1069 IO-APIC-level eth1
|
||||
|
||||
This time around IRQ44 was delivered only to the last four processors.
|
||||
i.e., counters for the CPU0-3 did not change.
|
||||
|
||||
Here is an example of limiting that same irq (44) to cpus 1024 to 1031:
|
||||
Here is an example of limiting that same irq (44) to cpus 1024 to 1031::
|
||||
|
||||
[root@moon 44]# echo 1024-1031 > smp_affinity_list
|
||||
[root@moon 44]# cat smp_affinity_list
|
||||
1024-1031
|
||||
[root@moon 44]# echo 1024-1031 > smp_affinity_list
|
||||
[root@moon 44]# cat smp_affinity_list
|
||||
1024-1031
|
||||
|
||||
Note that to do this with a bitmask would require 32 bitmasks of zero
|
||||
to follow the pertinent one.
|
||||
|
@ -1,4 +1,6 @@
|
||||
irq_domain interrupt number mapping library
|
||||
===============================================
|
||||
The irq_domain interrupt number mapping library
|
||||
===============================================
|
||||
|
||||
The current design of the Linux kernel uses a single large number
|
||||
space where each separate IRQ source is assigned a different number.
|
||||
@ -36,7 +38,9 @@ irq_domain also implements translation from an abstract irq_fwspec
|
||||
structure to hwirq numbers (Device Tree and ACPI GSI so far), and can
|
||||
be easily extended to support other IRQ topology data sources.
|
||||
|
||||
=== irq_domain usage ===
|
||||
irq_domain usage
|
||||
================
|
||||
|
||||
An interrupt controller driver creates and registers an irq_domain by
|
||||
calling one of the irq_domain_add_*() functions (each mapping method
|
||||
has a different allocator function, more on that later). The function
|
||||
@ -62,15 +66,21 @@ If the driver has the Linux IRQ number or the irq_data pointer, and
|
||||
needs to know the associated hwirq number (such as in the irq_chip
|
||||
callbacks) then it can be directly obtained from irq_data->hwirq.
|
||||
|
||||
=== Types of irq_domain mappings ===
|
||||
Types of irq_domain mappings
|
||||
============================
|
||||
|
||||
There are several mechanisms available for reverse mapping from hwirq
|
||||
to Linux irq, and each mechanism uses a different allocation function.
|
||||
Which reverse map type should be used depends on the use case. Each
|
||||
of the reverse map types is described below:
|
||||
|
||||
==== Linear ====
|
||||
irq_domain_add_linear()
|
||||
irq_domain_create_linear()
|
||||
Linear
|
||||
------
|
||||
|
||||
::
|
||||
|
||||
irq_domain_add_linear()
|
||||
irq_domain_create_linear()
|
||||
|
||||
The linear reverse map maintains a fixed size table indexed by the
|
||||
hwirq number. When a hwirq is mapped, an irq_desc is allocated for
|
||||
@ -89,9 +99,13 @@ accepts a more general abstraction 'struct fwnode_handle'.
|
||||
|
||||
The majority of drivers should use the linear map.
|
||||
|
||||
==== Tree ====
|
||||
irq_domain_add_tree()
|
||||
irq_domain_create_tree()
|
||||
Tree
|
||||
----
|
||||
|
||||
::
|
||||
|
||||
irq_domain_add_tree()
|
||||
irq_domain_create_tree()
|
||||
|
||||
The irq_domain maintains a radix tree map from hwirq numbers to Linux
|
||||
IRQs. When an hwirq is mapped, an irq_desc is allocated and the
|
||||
@ -109,8 +123,12 @@ accepts a more general abstraction 'struct fwnode_handle'.
|
||||
|
||||
Very few drivers should need this mapping.
|
||||
|
||||
==== No Map ===-
|
||||
irq_domain_add_nomap()
|
||||
No Map
|
||||
------
|
||||
|
||||
::
|
||||
|
||||
irq_domain_add_nomap()
|
||||
|
||||
The No Map mapping is to be used when the hwirq number is
|
||||
programmable in the hardware. In this case it is best to program the
|
||||
@ -121,10 +139,14 @@ Linux IRQ number into the hardware.
|
||||
|
||||
Most drivers cannot use this mapping.
|
||||
|
||||
==== Legacy ====
|
||||
irq_domain_add_simple()
|
||||
irq_domain_add_legacy()
|
||||
irq_domain_add_legacy_isa()
|
||||
Legacy
|
||||
------
|
||||
|
||||
::
|
||||
|
||||
irq_domain_add_simple()
|
||||
irq_domain_add_legacy()
|
||||
irq_domain_add_legacy_isa()
|
||||
|
||||
The Legacy mapping is a special case for drivers that already have a
|
||||
range of irq_descs allocated for the hwirqs. It is used when the
|
||||
@ -163,14 +185,17 @@ that the driver using the simple domain call irq_create_mapping()
|
||||
before any irq_find_mapping() since the latter will actually work
|
||||
for the static IRQ assignment case.
|
||||
|
||||
==== Hierarchy IRQ domain ====
|
||||
Hierarchy IRQ domain
|
||||
--------------------
|
||||
|
||||
On some architectures, there may be multiple interrupt controllers
|
||||
involved in delivering an interrupt from the device to the target CPU.
|
||||
Let's look at a typical interrupt delivering path on x86 platforms:
|
||||
Let's look at a typical interrupt delivering path on x86 platforms::
|
||||
|
||||
Device --> IOAPIC -> Interrupt remapping Controller -> Local APIC -> CPU
|
||||
Device --> IOAPIC -> Interrupt remapping Controller -> Local APIC -> CPU
|
||||
|
||||
There are three interrupt controllers involved:
|
||||
|
||||
1) IOAPIC controller
|
||||
2) Interrupt remapping controller
|
||||
3) Local APIC controller
|
||||
@ -180,7 +205,8 @@ hardware architecture, an irq_domain data structure is built for each
|
||||
interrupt controller and those irq_domains are organized into hierarchy.
|
||||
When building irq_domain hierarchy, the irq_domain near to the device is
|
||||
child and the irq_domain near to CPU is parent. So a hierarchy structure
|
||||
as below will be built for the example above.
|
||||
as below will be built for the example above::
|
||||
|
||||
CPU Vector irq_domain (root irq_domain to manage CPU vectors)
|
||||
^
|
||||
|
|
||||
@ -190,6 +216,7 @@ as below will be built for the example above.
|
||||
IOAPIC irq_domain (manage IOAPIC delivery entries/pins)
|
||||
|
||||
There are four major interfaces to use hierarchy irq_domain:
|
||||
|
||||
1) irq_domain_alloc_irqs(): allocate IRQ descriptors and interrupt
|
||||
controller related resources to deliver these interrupts.
|
||||
2) irq_domain_free_irqs(): free IRQ descriptors and interrupt controller
|
||||
@ -199,7 +226,8 @@ There are four major interfaces to use hierarchy irq_domain:
|
||||
4) irq_domain_deactivate_irq(): deactivate interrupt controller hardware
|
||||
to stop delivering the interrupt.
|
||||
|
||||
Following changes are needed to support hierarchy irq_domain.
|
||||
Following changes are needed to support hierarchy irq_domain:
|
||||
|
||||
1) a new field 'parent' is added to struct irq_domain; it's used to
|
||||
maintain irq_domain hierarchy information.
|
||||
2) a new field 'parent_data' is added to struct irq_data; it's used to
|
||||
@ -223,6 +251,7 @@ software architecture.
|
||||
|
||||
For an interrupt controller driver to support hierarchy irq_domain, it
|
||||
needs to:
|
||||
|
||||
1) Implement irq_domain_ops.alloc and irq_domain_ops.free
|
||||
2) Optionally implement irq_domain_ops.activate and
|
||||
irq_domain_ops.deactivate.
|
||||
|
@ -1,4 +1,6 @@
|
||||
===============
|
||||
What is an IRQ?
|
||||
===============
|
||||
|
||||
An IRQ is an interrupt request from a device.
|
||||
Currently they can come in over a pin, or over a packet.
|
||||
|
@ -1,3 +1,4 @@
|
||||
===================
|
||||
Linux IOMMU Support
|
||||
===================
|
||||
|
||||
@ -9,11 +10,11 @@ This guide gives a quick cheat sheet for some basic understanding.
|
||||
|
||||
Some Keywords
|
||||
|
||||
DMAR - DMA remapping
|
||||
DRHD - DMA Remapping Hardware Unit Definition
|
||||
RMRR - Reserved memory Region Reporting Structure
|
||||
ZLR - Zero length reads from PCI devices
|
||||
IOVA - IO Virtual address.
|
||||
- DMAR - DMA remapping
|
||||
- DRHD - DMA Remapping Hardware Unit Definition
|
||||
- RMRR - Reserved memory Region Reporting Structure
|
||||
- ZLR - Zero length reads from PCI devices
|
||||
- IOVA - IO Virtual address.
|
||||
|
||||
Basic stuff
|
||||
-----------
|
||||
@ -33,7 +34,7 @@ devices that need to access these regions. OS is expected to setup
|
||||
unity mappings for these regions for these devices to access these regions.
|
||||
|
||||
How is IOVA generated?
|
||||
---------------------
|
||||
----------------------
|
||||
|
||||
Well behaved drivers call pci_map_*() calls before sending command to device
|
||||
that needs to perform DMA. Once DMA is completed and mapping is no longer
|
||||
@ -82,14 +83,14 @@ in ACPI.
|
||||
ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
|
||||
|
||||
When DMAR is being processed and initialized by ACPI, prints DMAR locations
|
||||
and any RMRR's processed.
|
||||
and any RMRR's processed::
|
||||
|
||||
ACPI DMAR:Host address width 36
|
||||
ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
|
||||
ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
|
||||
ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
|
||||
ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
|
||||
ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
|
||||
ACPI DMAR:Host address width 36
|
||||
ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
|
||||
ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
|
||||
ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
|
||||
ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
|
||||
ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
|
||||
|
||||
When DMAR is enabled for use, you will notice..
|
||||
|
||||
@ -98,10 +99,12 @@ PCI-DMA: Using DMAR IOMMU
|
||||
Fault reporting
|
||||
---------------
|
||||
|
||||
DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
|
||||
DMAR:[fault reason 05] PTE Write access is not set
|
||||
DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
|
||||
DMAR:[fault reason 05] PTE Write access is not set
|
||||
::
|
||||
|
||||
DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
|
||||
DMAR:[fault reason 05] PTE Write access is not set
|
||||
DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
|
||||
DMAR:[fault reason 05] PTE Write access is not set
|
||||
|
||||
TBD
|
||||
----
|
||||
|
@ -1,5 +1,9 @@
|
||||
Linux 2.4.2 Secure Attention Key (SAK) handling
|
||||
18 March 2001, Andrew Morton
|
||||
=========================================
|
||||
Linux Secure Attention Key (SAK) handling
|
||||
=========================================
|
||||
|
||||
:Date: 18 March 2001
|
||||
:Author: Andrew Morton
|
||||
|
||||
An operating system's Secure Attention Key is a security tool which is
|
||||
provided as protection against trojan password capturing programs. It
|
||||
@ -13,7 +17,7 @@ this sequence. It is only available if the kernel was compiled with
|
||||
sysrq support.
|
||||
|
||||
The proper way of generating a SAK is to define the key sequence using
|
||||
`loadkeys'. This will work whether or not sysrq support is compiled
|
||||
``loadkeys``. This will work whether or not sysrq support is compiled
|
||||
into the kernel.
|
||||
|
||||
SAK works correctly when the keyboard is in raw mode. This means that
|
||||
@ -25,64 +29,63 @@ What key sequence should you use? Well, CTRL-ALT-DEL is used to reboot
|
||||
the machine. CTRL-ALT-BACKSPACE is magical to the X server. We'll
|
||||
choose CTRL-ALT-PAUSE.
|
||||
|
||||
In your rc.sysinit (or rc.local) file, add the command
|
||||
In your rc.sysinit (or rc.local) file, add the command::
|
||||
|
||||
echo "control alt keycode 101 = SAK" | /bin/loadkeys
|
||||
|
||||
And that's it! Only the superuser may reprogram the SAK key.
|
||||
|
||||
|
||||
NOTES
|
||||
=====
|
||||
.. note::
|
||||
|
||||
1: Linux SAK is said to be not a "true SAK" as is required by
|
||||
systems which implement C2 level security. This author does not
|
||||
know why.
|
||||
1. Linux SAK is said to be not a "true SAK" as is required by
|
||||
systems which implement C2 level security. This author does not
|
||||
know why.
|
||||
|
||||
|
||||
2: On the PC keyboard, SAK kills all applications which have
|
||||
/dev/console opened.
|
||||
2. On the PC keyboard, SAK kills all applications which have
|
||||
/dev/console opened.
|
||||
|
||||
Unfortunately this includes a number of things which you don't
|
||||
actually want killed. This is because these applications are
|
||||
incorrectly holding /dev/console open. Be sure to complain to your
|
||||
Linux distributor about this!
|
||||
Unfortunately this includes a number of things which you don't
|
||||
actually want killed. This is because these applications are
|
||||
incorrectly holding /dev/console open. Be sure to complain to your
|
||||
Linux distributor about this!
|
||||
|
||||
You can identify processes which will be killed by SAK with the
|
||||
command
|
||||
You can identify processes which will be killed by SAK with the
|
||||
command::
|
||||
|
||||
# ls -l /proc/[0-9]*/fd/* | grep console
|
||||
l-wx------ 1 root root 64 Mar 18 00:46 /proc/579/fd/0 -> /dev/console
|
||||
|
||||
Then:
|
||||
Then::
|
||||
|
||||
# ps aux|grep 579
|
||||
root 579 0.0 0.1 1088 436 ? S 00:43 0:00 gpm -t ps/2
|
||||
|
||||
So `gpm' will be killed by SAK. This is a bug in gpm. It should
|
||||
be closing standard input. You can work around this by finding the
|
||||
initscript which launches gpm and changing it thusly:
|
||||
So ``gpm`` will be killed by SAK. This is a bug in gpm. It should
|
||||
be closing standard input. You can work around this by finding the
|
||||
initscript which launches gpm and changing it thusly:
|
||||
|
||||
Old:
|
||||
Old::
|
||||
|
||||
daemon gpm
|
||||
|
||||
New:
|
||||
New::
|
||||
|
||||
daemon gpm < /dev/null
|
||||
|
||||
Vixie cron also seems to have this problem, and needs the same treatment.
|
||||
Vixie cron also seems to have this problem, and needs the same treatment.
|
||||
|
||||
Also, one prominent Linux distribution has the following three
|
||||
lines in its rc.sysinit and rc scripts:
|
||||
Also, one prominent Linux distribution has the following three
|
||||
lines in its rc.sysinit and rc scripts::
|
||||
|
||||
exec 3<&0
|
||||
exec 4>&1
|
||||
exec 5>&2
|
||||
|
||||
These commands cause *all* daemons which are launched by the
|
||||
initscripts to have file descriptors 3, 4 and 5 attached to
|
||||
/dev/console. So SAK kills them all. A workaround is to simply
|
||||
delete these lines, but this may cause system management
|
||||
applications to malfunction - test everything well.
|
||||
These commands cause **all** daemons which are launched by the
|
||||
initscripts to have file descriptors 3, 4 and 5 attached to
|
||||
/dev/console. So SAK kills them all. A workaround is to simply
|
||||
delete these lines, but this may cause system management
|
||||
applications to malfunction - test everything well.
|
||||
|
||||
|
@ -1,7 +1,10 @@
|
||||
SM501 Driver
|
||||
============
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
Copyright 2006, 2007 Simtec Electronics
|
||||
============
|
||||
SM501 Driver
|
||||
============
|
||||
|
||||
:Copyright: |copy| 2006, 2007 Simtec Electronics
|
||||
|
||||
The Silicon Motion SM501 multimedia companion chip is a multifunction device
|
||||
which may provide numerous interfaces including USB host controller, USB gadget,
|
||||
|
@ -61,12 +61,15 @@ stable kernels.
|
||||
| Cavium | ThunderX ITS | #23144 | CAVIUM_ERRATUM_23144 |
|
||||
| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 |
|
||||
| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 |
|
||||
| Cavium | ThunderX SMMUv2 | #27704 | N/A |
|
||||
| Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 |
|
||||
| Cavium | ThunderX SMMUv2 | #27704 | N/A |
|
||||
| Cavium | ThunderX2 SMMUv3| #74 | N/A |
|
||||
| Cavium | ThunderX2 SMMUv3| #126 | N/A |
|
||||
| | | | |
|
||||
| Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 |
|
||||
| | | | |
|
||||
| Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 |
|
||||
| Hisilicon | Hip0{6,7} | #161010701 | N/A |
|
||||
| | | | |
|
||||
| Qualcomm Tech. | Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 |
|
||||
| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 |
|
||||
|
@ -1,10 +1,15 @@
|
||||
============================
|
||||
A block layer cache (bcache)
|
||||
============================
|
||||
|
||||
Say you've got a big slow raid 6, and an ssd or three. Wouldn't it be
|
||||
nice if you could use them as cache... Hence bcache.
|
||||
|
||||
Wiki and git repositories are at:
|
||||
http://bcache.evilpiepirate.org
|
||||
http://evilpiepirate.org/git/linux-bcache.git
|
||||
http://evilpiepirate.org/git/bcache-tools.git
|
||||
|
||||
- http://bcache.evilpiepirate.org
|
||||
- http://evilpiepirate.org/git/linux-bcache.git
|
||||
- http://evilpiepirate.org/git/bcache-tools.git
|
||||
|
||||
It's designed around the performance characteristics of SSDs - it only allocates
|
||||
in erase block sized buckets, and it uses a hybrid btree/log to track cached
|
||||
@ -37,17 +42,19 @@ to be flushed.
|
||||
|
||||
Getting started:
|
||||
You'll need make-bcache from the bcache-tools repository. Both the cache device
|
||||
and backing device must be formatted before use.
|
||||
and backing device must be formatted before use::
|
||||
|
||||
make-bcache -B /dev/sdb
|
||||
make-bcache -C /dev/sdc
|
||||
|
||||
make-bcache has the ability to format multiple devices at the same time - if
|
||||
you format your backing devices and cache device at the same time, you won't
|
||||
have to manually attach:
|
||||
have to manually attach::
|
||||
|
||||
make-bcache -B /dev/sda /dev/sdb -C /dev/sdc
|
||||
|
||||
bcache-tools now ships udev rules, and bcache devices are known to the kernel
|
||||
immediately. Without udev, you can manually register devices like this:
|
||||
immediately. Without udev, you can manually register devices like this::
|
||||
|
||||
echo /dev/sdb > /sys/fs/bcache/register
|
||||
echo /dev/sdc > /sys/fs/bcache/register
|
||||
@ -60,16 +67,16 @@ slow devices as bcache backing devices without a cache, and you can choose to ad
|
||||
a caching device later.
|
||||
See 'ATTACHING' section below.
|
||||
|
||||
The devices show up as:
|
||||
The devices show up as::
|
||||
|
||||
/dev/bcache<N>
|
||||
|
||||
As well as (with udev):
|
||||
As well as (with udev)::
|
||||
|
||||
/dev/bcache/by-uuid/<uuid>
|
||||
/dev/bcache/by-label/<label>
|
||||
|
||||
To get started:
|
||||
To get started::
|
||||
|
||||
mkfs.ext4 /dev/bcache0
|
||||
mount /dev/bcache0 /mnt
|
||||
@ -81,13 +88,13 @@ Cache devices are managed as sets; multiple caches per set isn't supported yet
|
||||
but will allow for mirroring of metadata and dirty data in the future. Your new
|
||||
cache set shows up as /sys/fs/bcache/<UUID>
|
||||
|
||||
ATTACHING
|
||||
Attaching
|
||||
---------
|
||||
|
||||
After your cache device and backing device are registered, the backing device
|
||||
must be attached to your cache set to enable caching. Attaching a backing
|
||||
device to a cache set is done thusly, with the UUID of the cache set in
|
||||
/sys/fs/bcache:
|
||||
/sys/fs/bcache::
|
||||
|
||||
echo <CSET-UUID> > /sys/block/bcache0/bcache/attach
|
||||
|
||||
@ -97,7 +104,7 @@ your bcache devices. If a backing device has data in a cache somewhere, the
|
||||
important if you have writeback caching turned on.
|
||||
|
||||
If you're booting up and your cache device is gone and never coming back, you
|
||||
can force run the backing device:
|
||||
can force run the backing device::
|
||||
|
||||
echo 1 > /sys/block/sdb/bcache/running
|
||||
|
||||
@ -110,7 +117,7 @@ but all the cached data will be invalidated. If there was dirty data in the
|
||||
cache, don't expect the filesystem to be recoverable - you will have massive
|
||||
filesystem corruption, though ext4's fsck does work miracles.
|
||||
|
||||
ERROR HANDLING
|
||||
Error Handling
|
||||
--------------
|
||||
|
||||
Bcache tries to transparently handle IO errors to/from the cache device without
|
||||
@ -134,25 +141,27 @@ the backing devices to passthrough mode.
|
||||
read some of the dirty data, though.
|
||||
|
||||
|
||||
HOWTO/COOKBOOK
|
||||
Howto/cookbook
|
||||
--------------
|
||||
|
||||
A) Starting a bcache with a missing caching device
|
||||
|
||||
If registering the backing device doesn't help, it's already there, you just need
|
||||
to force it to run without the cache:
|
||||
to force it to run without the cache::
|
||||
|
||||
host:~# echo /dev/sdb1 > /sys/fs/bcache/register
|
||||
[ 119.844831] bcache: register_bcache() error opening /dev/sdb1: device already registered
|
||||
|
||||
Next, you try to register your caching device if it's present. However
|
||||
if it's absent, or registration fails for some reason, you can still
|
||||
start your bcache without its cache, like so:
|
||||
start your bcache without its cache, like so::
|
||||
|
||||
host:/sys/block/sdb/sdb1/bcache# echo 1 > running
|
||||
|
||||
Note that this may cause data loss if you were running in writeback mode.
|
||||
|
||||
|
||||
B) Bcache does not find its cache
|
||||
B) Bcache does not find its cache::
|
||||
|
||||
host:/sys/block/md5/bcache# echo 0226553a-37cf-41d5-b3ce-8b1e944543a8 > attach
|
||||
[ 1933.455082] bcache: bch_cached_dev_attach() Couldn't find uuid for md5 in set
|
||||
@ -160,7 +169,8 @@ B) Bcache does not find its cache
|
||||
[ 1933.478179] : cache set not found
|
||||
|
||||
In this case, the caching device was simply not registered at boot
|
||||
or disappeared and came back, and needs to be (re-)registered:
|
||||
or disappeared and came back, and needs to be (re-)registered::
|
||||
|
||||
host:/sys/block/md5/bcache# echo /dev/sdh2 > /sys/fs/bcache/register
|
||||
|
||||
|
||||
@ -180,7 +190,8 @@ device is still available at an 8KiB offset. So either via a loopdev
|
||||
of the backing device created with --offset 8K, or any value defined by
|
||||
--data-offset when you originally formatted bcache with `make-bcache`.
|
||||
|
||||
For example:
|
||||
For example::
|
||||
|
||||
losetup -o 8192 /dev/loop0 /dev/your_bcache_backing_dev
|
||||
|
||||
This should present your unmodified backing device data in /dev/loop0
|
||||
@ -191,33 +202,38 @@ cache device without loosing data.
|
||||
|
||||
E) Wiping a cache device
|
||||
|
||||
host:~# wipefs -a /dev/sdh2
|
||||
16 bytes were erased at offset 0x1018 (bcache)
|
||||
they were: c6 85 73 f6 4e 1a 45 ca 82 65 f5 7f 48 ba 6d 81
|
||||
::
|
||||
|
||||
After you boot back with bcache enabled, you recreate the cache and attach it:
|
||||
host:~# make-bcache -C /dev/sdh2
|
||||
UUID: 7be7e175-8f4c-4f99-94b2-9c904d227045
|
||||
Set UUID: 5bc072a8-ab17-446d-9744-e247949913c1
|
||||
version: 0
|
||||
nbuckets: 106874
|
||||
block_size: 1
|
||||
bucket_size: 1024
|
||||
nr_in_set: 1
|
||||
nr_this_dev: 0
|
||||
first_bucket: 1
|
||||
[ 650.511912] bcache: run_cache_set() invalidating existing data
|
||||
[ 650.549228] bcache: register_cache() registered cache device sdh2
|
||||
host:~# wipefs -a /dev/sdh2
|
||||
16 bytes were erased at offset 0x1018 (bcache)
|
||||
they were: c6 85 73 f6 4e 1a 45 ca 82 65 f5 7f 48 ba 6d 81
|
||||
|
||||
start backing device with missing cache:
|
||||
host:/sys/block/md5/bcache# echo 1 > running
|
||||
After you boot back with bcache enabled, you recreate the cache and attach it::
|
||||
|
||||
attach new cache:
|
||||
host:/sys/block/md5/bcache# echo 5bc072a8-ab17-446d-9744-e247949913c1 > attach
|
||||
[ 865.276616] bcache: bch_cached_dev_attach() Caching md5 as bcache0 on set 5bc072a8-ab17-446d-9744-e247949913c1
|
||||
host:~# make-bcache -C /dev/sdh2
|
||||
UUID: 7be7e175-8f4c-4f99-94b2-9c904d227045
|
||||
Set UUID: 5bc072a8-ab17-446d-9744-e247949913c1
|
||||
version: 0
|
||||
nbuckets: 106874
|
||||
block_size: 1
|
||||
bucket_size: 1024
|
||||
nr_in_set: 1
|
||||
nr_this_dev: 0
|
||||
first_bucket: 1
|
||||
[ 650.511912] bcache: run_cache_set() invalidating existing data
|
||||
[ 650.549228] bcache: register_cache() registered cache device sdh2
|
||||
|
||||
start backing device with missing cache::
|
||||
|
||||
host:/sys/block/md5/bcache# echo 1 > running
|
||||
|
||||
attach new cache::
|
||||
|
||||
host:/sys/block/md5/bcache# echo 5bc072a8-ab17-446d-9744-e247949913c1 > attach
|
||||
[ 865.276616] bcache: bch_cached_dev_attach() Caching md5 as bcache0 on set 5bc072a8-ab17-446d-9744-e247949913c1
|
||||
|
||||
|
||||
F) Remove or replace a caching device
|
||||
F) Remove or replace a caching device::
|
||||
|
||||
host:/sys/block/sda/sda7/bcache# echo 1 > detach
|
||||
[ 695.872542] bcache: cached_dev_detach_finish() Caching disabled for sda7
|
||||
@ -226,13 +242,15 @@ F) Remove or replace a caching device
|
||||
wipefs: error: /dev/nvme0n1p4: probing initialization failed: Device or resource busy
|
||||
Ooops, it's disabled, but not unregistered, so it's still protected
|
||||
|
||||
We need to go and unregister it:
|
||||
We need to go and unregister it::
|
||||
|
||||
host:/sys/fs/bcache/b7ba27a1-2398-4649-8ae3-0959f57ba128# ls -l cache0
|
||||
lrwxrwxrwx 1 root root 0 Feb 25 18:33 cache0 -> ../../../devices/pci0000:00/0000:00:1d.0/0000:70:00.0/nvme/nvme0/nvme0n1/nvme0n1p4/bcache/
|
||||
host:/sys/fs/bcache/b7ba27a1-2398-4649-8ae3-0959f57ba128# echo 1 > stop
|
||||
kernel: [ 917.041908] bcache: cache_set_free() Cache set b7ba27a1-2398-4649-8ae3-0959f57ba128 unregistered
|
||||
|
||||
Now we can wipe it:
|
||||
Now we can wipe it::
|
||||
|
||||
host:~# wipefs -a /dev/nvme0n1p4
|
||||
/dev/nvme0n1p4: 16 bytes were erased at offset 0x00001018 (bcache): c6 85 73 f6 4e 1a 45 ca 82 65 f5 7f 48 ba 6d 81
|
||||
|
||||
@ -252,40 +270,44 @@ if there are any active backing or caching devices left on it:
|
||||
|
||||
1) Is it present in /dev/bcache* ? (there are times where it won't be)
|
||||
|
||||
If so, it's easy:
|
||||
If so, it's easy::
|
||||
|
||||
host:/sys/block/bcache0/bcache# echo 1 > stop
|
||||
|
||||
2) But if your backing device is gone, this won't work:
|
||||
2) But if your backing device is gone, this won't work::
|
||||
|
||||
host:/sys/block/bcache0# cd bcache
|
||||
bash: cd: bcache: No such file or directory
|
||||
|
||||
In this case, you may have to unregister the dmcrypt block device that
|
||||
references this bcache to free it up:
|
||||
In this case, you may have to unregister the dmcrypt block device that
|
||||
references this bcache to free it up::
|
||||
|
||||
host:~# dmsetup remove oldds1
|
||||
bcache: bcache_device_free() bcache0 stopped
|
||||
bcache: cache_set_free() Cache set 5bc072a8-ab17-446d-9744-e247949913c1 unregistered
|
||||
|
||||
This causes the backing bcache to be removed from /sys/fs/bcache and
|
||||
then it can be reused. This would be true of any block device stacking
|
||||
where bcache is a lower device.
|
||||
This causes the backing bcache to be removed from /sys/fs/bcache and
|
||||
then it can be reused. This would be true of any block device stacking
|
||||
where bcache is a lower device.
|
||||
|
||||
3) In other cases, you can also look in /sys/fs/bcache/:
|
||||
3) In other cases, you can also look in /sys/fs/bcache/::
|
||||
|
||||
host:/sys/fs/bcache# ls -l */{cache?,bdev?}
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/bdev1 -> ../../../devices/virtual/block/dm-1/bcache/
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/cache0 -> ../../../devices/virtual/block/dm-4/bcache/
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 5bc072a8-ab17-446d-9744-e247949913c1/cache0 -> ../../../devices/pci0000:00/0000:00:01.0/0000:01:00.0/ata10/host9/target9:0:0/9:0:0:0/block/sdl/sdl2/bcache/
|
||||
host:/sys/fs/bcache# ls -l */{cache?,bdev?}
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/bdev1 -> ../../../devices/virtual/block/dm-1/bcache/
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/cache0 -> ../../../devices/virtual/block/dm-4/bcache/
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 5bc072a8-ab17-446d-9744-e247949913c1/cache0 -> ../../../devices/pci0000:00/0000:00:01.0/0000:01:00.0/ata10/host9/target9:0:0/9:0:0:0/block/sdl/sdl2/bcache/
|
||||
|
||||
The device names will show which UUID is relevant, cd in that directory
|
||||
and stop the cache::
|
||||
|
||||
The device names will show which UUID is relevant, cd in that directory
|
||||
and stop the cache:
|
||||
host:/sys/fs/bcache/5bc072a8-ab17-446d-9744-e247949913c1# echo 1 > stop
|
||||
|
||||
This will free up bcache references and let you reuse the partition for
|
||||
other purposes.
|
||||
This will free up bcache references and let you reuse the partition for
|
||||
other purposes.
|
||||
|
||||
|
||||
|
||||
TROUBLESHOOTING PERFORMANCE
|
||||
Troubleshooting performance
|
||||
---------------------------
|
||||
|
||||
Bcache has a bunch of config options and tunables. The defaults are intended to
|
||||
@ -301,11 +323,13 @@ want for getting the best possible numbers when benchmarking.
|
||||
raid stripe size to get the disk multiples that you would like.
|
||||
|
||||
For example: If you have a 64k stripe size, then the following offset
|
||||
would provide alignment for many common RAID5 data spindle counts:
|
||||
would provide alignment for many common RAID5 data spindle counts::
|
||||
|
||||
64k * 2*2*2*3*3*5*7 bytes = 161280k
|
||||
|
||||
That space is wasted, but for only 157.5MB you can grow your RAID 5
|
||||
volume to the following data-spindle counts without re-aligning:
|
||||
volume to the following data-spindle counts without re-aligning::
|
||||
|
||||
3,4,5,6,7,8,9,10,12,14,15,18,20,21 ...
|
||||
|
||||
- Bad write performance
|
||||
@ -313,9 +337,9 @@ want for getting the best possible numbers when benchmarking.
|
||||
If write performance is not what you expected, you probably wanted to be
|
||||
running in writeback mode, which isn't the default (not due to a lack of
|
||||
maturity, but simply because in writeback mode you'll lose data if something
|
||||
happens to your SSD)
|
||||
happens to your SSD)::
|
||||
|
||||
# echo writeback > /sys/block/bcache0/bcache/cache_mode
|
||||
# echo writeback > /sys/block/bcache0/bcache/cache_mode
|
||||
|
||||
- Bad performance, or traffic not going to the SSD that you'd expect
|
||||
|
||||
@ -325,13 +349,13 @@ want for getting the best possible numbers when benchmarking.
|
||||
accessed data out of your cache.
|
||||
|
||||
But if you want to benchmark reads from cache, and you start out with fio
|
||||
writing an 8 gigabyte test file - so you want to disable that.
|
||||
writing an 8 gigabyte test file - so you want to disable that::
|
||||
|
||||
# echo 0 > /sys/block/bcache0/bcache/sequential_cutoff
|
||||
# echo 0 > /sys/block/bcache0/bcache/sequential_cutoff
|
||||
|
||||
To set it back to the default (4 mb), do
|
||||
To set it back to the default (4 mb), do::
|
||||
|
||||
# echo 4M > /sys/block/bcache0/bcache/sequential_cutoff
|
||||
# echo 4M > /sys/block/bcache0/bcache/sequential_cutoff
|
||||
|
||||
- Traffic's still going to the spindle/still getting cache misses
|
||||
|
||||
@ -344,10 +368,10 @@ want for getting the best possible numbers when benchmarking.
|
||||
throttles traffic if the latency exceeds a threshold (it does this by
|
||||
cranking down the sequential bypass).
|
||||
|
||||
You can disable this if you need to by setting the thresholds to 0:
|
||||
You can disable this if you need to by setting the thresholds to 0::
|
||||
|
||||
# echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us
|
||||
# echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us
|
||||
# echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us
|
||||
# echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us
|
||||
|
||||
The default is 2000 us (2 milliseconds) for reads, and 20000 for writes.
|
||||
|
||||
@ -369,7 +393,7 @@ want for getting the best possible numbers when benchmarking.
|
||||
a fix for the issue there).
|
||||
|
||||
|
||||
SYSFS - BACKING DEVICE
|
||||
Sysfs - backing device
|
||||
----------------------
|
||||
|
||||
Available at /sys/block/<bdev>/bcache, /sys/block/bcache*/bcache and
|
||||
@ -454,7 +478,8 @@ writeback_running
|
||||
still be added to the cache until it is mostly full; only meant for
|
||||
benchmarking. Defaults to on.
|
||||
|
||||
SYSFS - BACKING DEVICE STATS:
|
||||
Sysfs - backing device stats
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
There are directories with these numbers for a running total, as well as
|
||||
versions that decay over the past day, hour and 5 minutes; they're also
|
||||
@ -463,14 +488,11 @@ aggregated in the cache set directory as well.
|
||||
bypassed
|
||||
Amount of IO (both reads and writes) that has bypassed the cache
|
||||
|
||||
cache_hits
|
||||
cache_misses
|
||||
cache_hit_ratio
|
||||
cache_hits, cache_misses, cache_hit_ratio
|
||||
Hits and misses are counted per individual IO as bcache sees them; a
|
||||
partial hit is counted as a miss.
|
||||
|
||||
cache_bypass_hits
|
||||
cache_bypass_misses
|
||||
cache_bypass_hits, cache_bypass_misses
|
||||
Hits and misses for IO that is intended to skip the cache are still counted,
|
||||
but broken out here.
|
||||
|
||||
@ -482,7 +504,8 @@ cache_miss_collisions
|
||||
cache_readaheads
|
||||
Count of times readahead occurred.
|
||||
|
||||
SYSFS - CACHE SET:
|
||||
Sysfs - cache set
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
Available at /sys/fs/bcache/<cset-uuid>
|
||||
|
||||
@ -520,8 +543,7 @@ flash_vol_create
|
||||
Echoing a size to this file (in human readable units, k/M/G) creates a thinly
|
||||
provisioned volume backed by the cache set.
|
||||
|
||||
io_error_halflife
|
||||
io_error_limit
|
||||
io_error_halflife, io_error_limit
|
||||
These determine how many errors we accept before disabling the cache.
|
||||
Each error is decayed by the half life (in # ios). If the decaying count
|
||||
reaches io_error_limit dirty data is written out and the cache is disabled.
|
||||
@ -545,7 +567,8 @@ unregister
|
||||
Detaches all backing devices and closes the cache devices; if dirty data is
|
||||
present it will disable writeback caching and wait for it to be flushed.
|
||||
|
||||
SYSFS - CACHE SET INTERNAL:
|
||||
Sysfs - cache set internal
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This directory also exposes timings for a number of internal operations, with
|
||||
separate files for average duration, average frequency, last occurrence and max
|
||||
@ -574,7 +597,8 @@ cache_read_races
|
||||
trigger_gc
|
||||
Writing to this file forces garbage collection to run.
|
||||
|
||||
SYSFS - CACHE DEVICE:
|
||||
Sysfs - Cache device
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Available at /sys/block/<cdev>/bcache
|
||||
|
||||
|
@ -192,7 +192,7 @@ will require extra work due to the application tag.
|
||||
supported by the block device.
|
||||
|
||||
|
||||
int bio_integrity_prep(bio);
|
||||
bool bio_integrity_prep(bio);
|
||||
|
||||
To generate IMD for WRITE and to set up buffers for READ, the
|
||||
filesystem must call bio_integrity_prep(bio).
|
||||
@ -201,9 +201,7 @@ will require extra work due to the application tag.
|
||||
sector must be set, and the bio should have all data pages
|
||||
added. It is up to the caller to ensure that the bio does not
|
||||
change while I/O is in progress.
|
||||
|
||||
bio_integrity_prep() should only be called if
|
||||
bio_integrity_enabled() returned 1.
|
||||
Complete bio with error if prepare failed for some reason.
|
||||
|
||||
|
||||
5.3 PASSING EXISTING INTEGRITY METADATA
|
||||
|
@ -1,12 +1,8 @@
|
||||
===============================================================
|
||||
== BT8XXGPIO driver ==
|
||||
== ==
|
||||
== A driver for a selfmade cheap BT8xx based PCI GPIO-card ==
|
||||
== ==
|
||||
== For advanced documentation, see ==
|
||||
== http://www.bu3sch.de/btgpio.php ==
|
||||
===============================================================
|
||||
===================================================================
|
||||
A driver for a selfmade cheap BT8xx based PCI GPIO-card (bt8xxgpio)
|
||||
===================================================================
|
||||
|
||||
For advanced documentation, see http://www.bu3sch.de/btgpio.php
|
||||
|
||||
A generic digital 24-port PCI GPIO card can be built out of an ordinary
|
||||
Brooktree bt848, bt849, bt878 or bt879 based analog TV tuner card. The
|
||||
@ -17,9 +13,8 @@ The bt8xx chip does have 24 digital GPIO ports.
|
||||
These ports are accessible via 24 pins on the SMD chip package.
|
||||
|
||||
|
||||
==============================================
|
||||
== How to physically access the GPIO pins ==
|
||||
==============================================
|
||||
How to physically access the GPIO pins
|
||||
======================================
|
||||
|
||||
There are several ways to access these pins. One might unsolder the whole chip
|
||||
and put it on a custom PCI board, or one might only unsolder each individual
|
||||
@ -27,7 +22,7 @@ GPIO pin and solder that to some tiny wire. As the chip package really is tiny
|
||||
there are some advanced soldering skills needed in any case.
|
||||
|
||||
The physical pinouts are drawn in the following ASCII art.
|
||||
The GPIO pins are marked with G00-G23
|
||||
The GPIO pins are marked with G00-G23::
|
||||
|
||||
G G G G G G G G G G G G G G G G G G
|
||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
|
||||
|
@ -1,18 +1,16 @@
|
||||
=======================================================================
|
||||
README for btmrvl driver
|
||||
=======================================================================
|
||||
|
||||
=============
|
||||
btmrvl driver
|
||||
=============
|
||||
|
||||
All commands are used via debugfs interface.
|
||||
|
||||
=====================
|
||||
Set/get driver configurations:
|
||||
Set/get driver configurations
|
||||
=============================
|
||||
|
||||
Path: /debug/btmrvl/config/
|
||||
|
||||
gpiogap=[n]
|
||||
hscfgcmd
|
||||
These commands are used to configure the host sleep parameters.
|
||||
gpiogap=[n], hscfgcmd
|
||||
These commands are used to configure the host sleep parameters::
|
||||
bit 8:0 -- Gap
|
||||
bit 16:8 -- GPIO
|
||||
|
||||
@ -23,7 +21,8 @@ hscfgcmd
|
||||
where Gap is the gap in milliseconds between wakeup signal and
|
||||
wakeup event, or 0xff for special host sleep setting.
|
||||
|
||||
Usage:
|
||||
Usage::
|
||||
|
||||
# Use SDIO interface to wake up the host and set GAP to 0x80:
|
||||
echo 0xff80 > /debug/btmrvl/config/gpiogap
|
||||
echo 1 > /debug/btmrvl/config/hscfgcmd
|
||||
@ -32,15 +31,16 @@ hscfgcmd
|
||||
echo 0x03ff > /debug/btmrvl/config/gpiogap
|
||||
echo 1 > /debug/btmrvl/config/hscfgcmd
|
||||
|
||||
psmode=[n]
|
||||
pscmd
|
||||
psmode=[n], pscmd
|
||||
These commands are used to enable/disable auto sleep mode
|
||||
|
||||
where the option is:
|
||||
where the option is::
|
||||
|
||||
1 -- Enable auto sleep mode
|
||||
0 -- Disable auto sleep mode
|
||||
|
||||
Usage:
|
||||
Usage::
|
||||
|
||||
# Enable auto sleep mode
|
||||
echo 1 > /debug/btmrvl/config/psmode
|
||||
echo 1 > /debug/btmrvl/config/pscmd
|
||||
@ -50,15 +50,16 @@ pscmd
|
||||
echo 1 > /debug/btmrvl/config/pscmd
|
||||
|
||||
|
||||
hsmode=[n]
|
||||
hscmd
|
||||
hsmode=[n], hscmd
|
||||
These commands are used to enable host sleep or wake up firmware
|
||||
|
||||
where the option is:
|
||||
where the option is::
|
||||
|
||||
1 -- Enable host sleep
|
||||
0 -- Wake up firmware
|
||||
|
||||
Usage:
|
||||
Usage::
|
||||
|
||||
# Enable host sleep
|
||||
echo 1 > /debug/btmrvl/config/hsmode
|
||||
echo 1 > /debug/btmrvl/config/hscmd
|
||||
@ -68,12 +69,13 @@ hscmd
|
||||
echo 1 > /debug/btmrvl/config/hscmd
|
||||
|
||||
|
||||
======================
|
||||
Get driver status:
|
||||
Get driver status
|
||||
=================
|
||||
|
||||
Path: /debug/btmrvl/status/
|
||||
|
||||
Usage:
|
||||
Usage::
|
||||
|
||||
cat /debug/btmrvl/status/<args>
|
||||
|
||||
where the args are:
|
||||
@ -90,14 +92,17 @@ hsstate
|
||||
txdnldrdy
|
||||
This command displays the value of Tx download ready flag.
|
||||
|
||||
|
||||
=====================
|
||||
Issuing a raw hci command
|
||||
=========================
|
||||
|
||||
Use hcitool to issue raw hci command, refer to hcitool manual
|
||||
|
||||
Usage: Hcitool cmd <ogf> <ocf> [Parameters]
|
||||
Usage::
|
||||
|
||||
Hcitool cmd <ogf> <ocf> [Parameters]
|
||||
|
||||
Interface Control Command::
|
||||
|
||||
Interface Control Command
|
||||
hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00 --Enable All interface
|
||||
hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01 --Enable Wlan interface
|
||||
hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02 --Enable BT interface
|
||||
@ -105,13 +110,13 @@ Use hcitool to issue raw hci command, refer to hcitool manual
|
||||
hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01 --Disable Wlan interface
|
||||
hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02 --Disable BT interface
|
||||
|
||||
=======================================================================
|
||||
SD8688 firmware
|
||||
===============
|
||||
|
||||
Images:
|
||||
|
||||
SD8688 firmware:
|
||||
|
||||
/lib/firmware/sd8688_helper.bin
|
||||
/lib/firmware/sd8688.bin
|
||||
- /lib/firmware/sd8688_helper.bin
|
||||
- /lib/firmware/sd8688.bin
|
||||
|
||||
|
||||
The images can be downloaded from:
|
||||
|
@ -1,17 +1,27 @@
|
||||
[ NOTE: The virt_to_bus() and bus_to_virt() functions have been
|
||||
==========================================================
|
||||
How to access I/O mapped memory from within device drivers
|
||||
==========================================================
|
||||
|
||||
:Author: Linus
|
||||
|
||||
.. warning::
|
||||
|
||||
The virt_to_bus() and bus_to_virt() functions have been
|
||||
superseded by the functionality provided by the PCI DMA interface
|
||||
(see Documentation/DMA-API-HOWTO.txt). They continue
|
||||
to be documented below for historical purposes, but new code
|
||||
must not use them. --davidm 00/12/12 ]
|
||||
must not use them. --davidm 00/12/12
|
||||
|
||||
[ This is a mail message in response to a query on IO mapping, thus the
|
||||
strange format for a "document" ]
|
||||
::
|
||||
|
||||
[ This is a mail message in response to a query on IO mapping, thus the
|
||||
strange format for a "document" ]
|
||||
|
||||
The AHA-1542 is a bus-master device, and your patch makes the driver give the
|
||||
controller the physical address of the buffers, which is correct on x86
|
||||
(because all bus master devices see the physical memory mappings directly).
|
||||
|
||||
However, on many setups, there are actually _three_ different ways of looking
|
||||
However, on many setups, there are actually **three** different ways of looking
|
||||
at memory addresses, and in this case we actually want the third, the
|
||||
so-called "bus address".
|
||||
|
||||
@ -38,7 +48,7 @@ because the memory and the devices share the same address space, and that is
|
||||
not generally necessarily true on other PCI/ISA setups.
|
||||
|
||||
Now, just as an example, on the PReP (PowerPC Reference Platform), the
|
||||
CPU sees a memory map something like this (this is from memory):
|
||||
CPU sees a memory map something like this (this is from memory)::
|
||||
|
||||
0-2 GB "real memory"
|
||||
2 GB-3 GB "system IO" (inb/out and similar accesses on x86)
|
||||
@ -52,7 +62,7 @@ So when the CPU wants any bus master to write to physical memory 0, it
|
||||
has to give the master address 0x80000000 as the memory address.
|
||||
|
||||
So, for example, depending on how the kernel is actually mapped on the
|
||||
PPC, you can end up with a setup like this:
|
||||
PPC, you can end up with a setup like this::
|
||||
|
||||
physical address: 0
|
||||
virtual address: 0xC0000000
|
||||
@ -61,7 +71,7 @@ PPC, you can end up with a setup like this:
|
||||
where all the addresses actually point to the same thing. It's just seen
|
||||
through different translations..
|
||||
|
||||
Similarly, on the Alpha, the normal translation is
|
||||
Similarly, on the Alpha, the normal translation is::
|
||||
|
||||
physical address: 0
|
||||
virtual address: 0xfffffc0000000000
|
||||
@ -70,7 +80,7 @@ Similarly, on the Alpha, the normal translation is
|
||||
(but there are also Alphas where the physical address and the bus address
|
||||
are the same).
|
||||
|
||||
Anyway, the way to look up all these translations, you do
|
||||
Anyway, the way to look up all these translations, you do::
|
||||
|
||||
#include <asm/io.h>
|
||||
|
||||
@ -81,8 +91,8 @@ Anyway, the way to look up all these translations, you do
|
||||
|
||||
Now, when do you need these?
|
||||
|
||||
You want the _virtual_ address when you are actually going to access that
|
||||
pointer from the kernel. So you can have something like this:
|
||||
You want the **virtual** address when you are actually going to access that
|
||||
pointer from the kernel. So you can have something like this::
|
||||
|
||||
/*
|
||||
* this is the hardware "mailbox" we use to communicate with
|
||||
@ -104,7 +114,7 @@ pointer from the kernel. So you can have something like this:
|
||||
...
|
||||
|
||||
on the other hand, you want the bus address when you have a buffer that
|
||||
you want to give to the controller:
|
||||
you want to give to the controller::
|
||||
|
||||
/* ask the controller to read the sense status into "sense_buffer" */
|
||||
mbox.bufstart = virt_to_bus(&sense_buffer);
|
||||
@ -112,7 +122,7 @@ you want to give to the controller:
|
||||
mbox.status = 0;
|
||||
notify_controller(&mbox);
|
||||
|
||||
And you generally _never_ want to use the physical address, because you can't
|
||||
And you generally **never** want to use the physical address, because you can't
|
||||
use that from the CPU (the CPU only uses translated virtual addresses), and
|
||||
you can't use it from the bus master.
|
||||
|
||||
@ -124,8 +134,10 @@ be remapped as measured in units of pages, a.k.a. the pfn (the memory
|
||||
management layer doesn't know about devices outside the CPU, so it
|
||||
shouldn't need to know about "bus addresses" etc).
|
||||
|
||||
NOTE NOTE NOTE! The above is only one part of the whole equation. The above
|
||||
only talks about "real memory", that is, CPU memory (RAM).
|
||||
.. note::
|
||||
|
||||
The above is only one part of the whole equation. The above
|
||||
only talks about "real memory", that is, CPU memory (RAM).
|
||||
|
||||
There is a completely different type of memory too, and that's the "shared
|
||||
memory" on the PCI or ISA bus. That's generally not RAM (although in the case
|
||||
@ -137,20 +149,22 @@ whatever, and there is only one way to access it: the readb/writeb and
|
||||
related functions. You should never take the address of such memory, because
|
||||
there is really nothing you can do with such an address: it's not
|
||||
conceptually in the same memory space as "real memory" at all, so you cannot
|
||||
just dereference a pointer. (Sadly, on x86 it _is_ in the same memory space,
|
||||
just dereference a pointer. (Sadly, on x86 it **is** in the same memory space,
|
||||
so on x86 it actually works to just dereference a pointer, but it's not
|
||||
portable).
|
||||
|
||||
For such memory, you can do things like
|
||||
For such memory, you can do things like:
|
||||
|
||||
- reading::
|
||||
|
||||
- reading:
|
||||
/*
|
||||
* read first 32 bits from ISA memory at 0xC0000, aka
|
||||
* C000:0000 in DOS terms
|
||||
*/
|
||||
unsigned int signature = isa_readl(0xC0000);
|
||||
|
||||
- remapping and writing:
|
||||
- remapping and writing::
|
||||
|
||||
/*
|
||||
* remap framebuffer PCI memory area at 0xFC000000,
|
||||
* size 1MB, so that we can access it: We can directly
|
||||
@ -165,7 +179,8 @@ For such memory, you can do things like
|
||||
/* unmap when we unload the driver */
|
||||
iounmap(baseptr);
|
||||
|
||||
- copying and clearing:
|
||||
- copying and clearing::
|
||||
|
||||
/* get the 6-byte Ethernet address at ISA address E000:0040 */
|
||||
memcpy_fromio(kernel_buffer, 0xE0040, 6);
|
||||
/* write a packet to the driver */
|
||||
@ -181,10 +196,10 @@ happy that your driver works ;)
|
||||
Note that kernel versions 2.0.x (and earlier) mistakenly called the
|
||||
ioremap() function "vremap()". ioremap() is the proper name, but I
|
||||
didn't think straight when I wrote it originally. People who have to
|
||||
support both can do something like:
|
||||
support both can do something like::
|
||||
|
||||
/* support old naming silliness */
|
||||
#if LINUX_VERSION_CODE < 0x020100
|
||||
#if LINUX_VERSION_CODE < 0x020100
|
||||
#define ioremap vremap
|
||||
#define iounmap vfree
|
||||
#endif
|
||||
@ -196,13 +211,10 @@ And the above sounds worse than it really is. Most real drivers really
|
||||
don't do all that complex things (or rather: the complexity is not so
|
||||
much in the actual IO accesses as in error handling and timeouts etc).
|
||||
It's generally not hard to fix drivers, and in many cases the code
|
||||
actually looks better afterwards:
|
||||
actually looks better afterwards::
|
||||
|
||||
unsigned long signature = *(unsigned int *) 0xC0000;
|
||||
vs
|
||||
unsigned long signature = readl(0xC0000);
|
||||
|
||||
I think the second version actually is more readable, no?
|
||||
|
||||
Linus
|
||||
|
||||
|
@ -1,7 +1,8 @@
|
||||
Cache and TLB Flushing
|
||||
Under Linux
|
||||
==================================
|
||||
Cache and TLB Flushing Under Linux
|
||||
==================================
|
||||
|
||||
David S. Miller <davem@redhat.com>
|
||||
:Author: David S. Miller <davem@redhat.com>
|
||||
|
||||
This document describes the cache/tlb flushing interfaces called
|
||||
by the Linux VM subsystem. It enumerates over each interface,
|
||||
@ -28,7 +29,7 @@ Therefore when software page table changes occur, the kernel will
|
||||
invoke one of the following flush methods _after_ the page table
|
||||
changes occur:
|
||||
|
||||
1) void flush_tlb_all(void)
|
||||
1) ``void flush_tlb_all(void)``
|
||||
|
||||
The most severe flush of all. After this interface runs,
|
||||
any previous page table modification whatsoever will be
|
||||
@ -37,7 +38,7 @@ changes occur:
|
||||
This is usually invoked when the kernel page tables are
|
||||
changed, since such translations are "global" in nature.
|
||||
|
||||
2) void flush_tlb_mm(struct mm_struct *mm)
|
||||
2) ``void flush_tlb_mm(struct mm_struct *mm)``
|
||||
|
||||
This interface flushes an entire user address space from
|
||||
the TLB. After running, this interface must make sure that
|
||||
@ -49,8 +50,8 @@ changes occur:
|
||||
page table operations such as what happens during
|
||||
fork, and exec.
|
||||
|
||||
3) void flush_tlb_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
3) ``void flush_tlb_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)``
|
||||
|
||||
Here we are flushing a specific range of (user) virtual
|
||||
address translations from the TLB. After running, this
|
||||
@ -69,7 +70,7 @@ changes occur:
|
||||
call flush_tlb_page (see below) for each entry which may be
|
||||
modified.
|
||||
|
||||
4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
|
||||
4) ``void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)``
|
||||
|
||||
This time we need to remove the PAGE_SIZE sized translation
|
||||
from the TLB. The 'vma' is the backing structure used by
|
||||
@ -87,8 +88,8 @@ changes occur:
|
||||
|
||||
This is used primarily during fault processing.
|
||||
|
||||
5) void update_mmu_cache(struct vm_area_struct *vma,
|
||||
unsigned long address, pte_t *ptep)
|
||||
5) ``void update_mmu_cache(struct vm_area_struct *vma,
|
||||
unsigned long address, pte_t *ptep)``
|
||||
|
||||
At the end of every page fault, this routine is invoked to
|
||||
tell the architecture specific code that a translation
|
||||
@ -100,7 +101,7 @@ changes occur:
|
||||
translations for software managed TLB configurations.
|
||||
The sparc64 port currently does this.
|
||||
|
||||
6) void tlb_migrate_finish(struct mm_struct *mm)
|
||||
6) ``void tlb_migrate_finish(struct mm_struct *mm)``
|
||||
|
||||
This interface is called at the end of an explicit
|
||||
process migration. This interface provides a hook
|
||||
@ -112,7 +113,7 @@ changes occur:
|
||||
|
||||
Next, we have the cache flushing interfaces. In general, when Linux
|
||||
is changing an existing virtual-->physical mapping to a new value,
|
||||
the sequence will be in one of the following forms:
|
||||
the sequence will be in one of the following forms::
|
||||
|
||||
1) flush_cache_mm(mm);
|
||||
change_all_page_tables_of(mm);
|
||||
@ -143,7 +144,7 @@ and have no dependency on translation information.
|
||||
|
||||
Here are the routines, one by one:
|
||||
|
||||
1) void flush_cache_mm(struct mm_struct *mm)
|
||||
1) ``void flush_cache_mm(struct mm_struct *mm)``
|
||||
|
||||
This interface flushes an entire user address space from
|
||||
the caches. That is, after running, there will be no cache
|
||||
@ -152,7 +153,7 @@ Here are the routines, one by one:
|
||||
This interface is used to handle whole address space
|
||||
page table operations such as what happens during exit and exec.
|
||||
|
||||
2) void flush_cache_dup_mm(struct mm_struct *mm)
|
||||
2) ``void flush_cache_dup_mm(struct mm_struct *mm)``
|
||||
|
||||
This interface flushes an entire user address space from
|
||||
the caches. That is, after running, there will be no cache
|
||||
@ -164,8 +165,8 @@ Here are the routines, one by one:
|
||||
This option is separate from flush_cache_mm to allow some
|
||||
optimizations for VIPT caches.
|
||||
|
||||
3) void flush_cache_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
3) ``void flush_cache_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)``
|
||||
|
||||
Here we are flushing a specific range of (user) virtual
|
||||
addresses from the cache. After running, there will be no
|
||||
@ -181,7 +182,7 @@ Here are the routines, one by one:
|
||||
call flush_cache_page (see below) for each entry which may be
|
||||
modified.
|
||||
|
||||
4) void flush_cache_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn)
|
||||
4) ``void flush_cache_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn)``
|
||||
|
||||
This time we need to remove a PAGE_SIZE sized range
|
||||
from the cache. The 'vma' is the backing structure used by
|
||||
@ -202,7 +203,7 @@ Here are the routines, one by one:
|
||||
|
||||
This is used primarily during fault processing.
|
||||
|
||||
5) void flush_cache_kmaps(void)
|
||||
5) ``void flush_cache_kmaps(void)``
|
||||
|
||||
This routine need only be implemented if the platform utilizes
|
||||
highmem. It will be called right before all of the kmaps
|
||||
@ -214,8 +215,8 @@ Here are the routines, one by one:
|
||||
|
||||
This routine should be implemented in asm/highmem.h
|
||||
|
||||
6) void flush_cache_vmap(unsigned long start, unsigned long end)
|
||||
void flush_cache_vunmap(unsigned long start, unsigned long end)
|
||||
6) ``void flush_cache_vmap(unsigned long start, unsigned long end)``
|
||||
``void flush_cache_vunmap(unsigned long start, unsigned long end)``
|
||||
|
||||
Here in these two interfaces we are flushing a specific range
|
||||
of (kernel) virtual addresses from the cache. After running,
|
||||
@ -243,8 +244,10 @@ size). This setting will force the SYSv IPC layer to only allow user
|
||||
processes to mmap shared memory at address which are a multiple of
|
||||
this value.
|
||||
|
||||
NOTE: This does not fix shared mmaps, check out the sparc64 port for
|
||||
one way to solve this (in particular SPARC_FLAG_MMAPSHARED).
|
||||
.. note::
|
||||
|
||||
This does not fix shared mmaps, check out the sparc64 port for
|
||||
one way to solve this (in particular SPARC_FLAG_MMAPSHARED).
|
||||
|
||||
Next, you have to solve the D-cache aliasing issue for all
|
||||
other cases. Please keep in mind the fact that, for a given page
|
||||
@ -255,8 +258,8 @@ physical page into its address space, by implication the D-cache
|
||||
aliasing problem has the potential to exist since the kernel already
|
||||
maps this page at its virtual address.
|
||||
|
||||
void copy_user_page(void *to, void *from, unsigned long addr, struct page *page)
|
||||
void clear_user_page(void *to, unsigned long addr, struct page *page)
|
||||
``void copy_user_page(void *to, void *from, unsigned long addr, struct page *page)``
|
||||
``void clear_user_page(void *to, unsigned long addr, struct page *page)``
|
||||
|
||||
These two routines store data in user anonymous or COW
|
||||
pages. It allows a port to efficiently avoid D-cache alias
|
||||
@ -276,14 +279,16 @@ maps this page at its virtual address.
|
||||
If D-cache aliasing is not an issue, these two routines may
|
||||
simply call memcpy/memset directly and do nothing more.
|
||||
|
||||
void flush_dcache_page(struct page *page)
|
||||
``void flush_dcache_page(struct page *page)``
|
||||
|
||||
Any time the kernel writes to a page cache page, _OR_
|
||||
the kernel is about to read from a page cache page and
|
||||
user space shared/writable mappings of this page potentially
|
||||
exist, this routine is called.
|
||||
|
||||
NOTE: This routine need only be called for page cache pages
|
||||
.. note::
|
||||
|
||||
This routine need only be called for page cache pages
|
||||
which can potentially ever be mapped into the address
|
||||
space of a user process. So for example, VFS layer code
|
||||
handling vfs symlinks in the page cache need not call
|
||||
@ -322,18 +327,19 @@ maps this page at its virtual address.
|
||||
made of this flag bit, and if set the flush is done and the flag
|
||||
bit is cleared.
|
||||
|
||||
IMPORTANT NOTE: It is often important, if you defer the flush,
|
||||
.. important::
|
||||
|
||||
It is often important, if you defer the flush,
|
||||
that the actual flush occurs on the same CPU
|
||||
as did the cpu stores into the page to make it
|
||||
dirty. Again, see sparc64 for examples of how
|
||||
to deal with this.
|
||||
|
||||
void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long user_vaddr,
|
||||
void *dst, void *src, int len)
|
||||
void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long user_vaddr,
|
||||
void *dst, void *src, int len)
|
||||
``void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long user_vaddr, void *dst, void *src, int len)``
|
||||
``void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long user_vaddr, void *dst, void *src, int len)``
|
||||
|
||||
When the kernel needs to copy arbitrary data in and out
|
||||
of arbitrary user pages (f.e. for ptrace()) it will use
|
||||
these two routines.
|
||||
@ -344,8 +350,9 @@ maps this page at its virtual address.
|
||||
likely that you will need to flush the instruction cache
|
||||
for copy_to_user_page().
|
||||
|
||||
void flush_anon_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long vmaddr)
|
||||
``void flush_anon_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long vmaddr)``
|
||||
|
||||
When the kernel needs to access the contents of an anonymous
|
||||
page, it calls this function (currently only
|
||||
get_user_pages()). Note: flush_dcache_page() deliberately
|
||||
@ -354,7 +361,8 @@ maps this page at its virtual address.
|
||||
architectures). For incoherent architectures, it should flush
|
||||
the cache of the page at vmaddr.
|
||||
|
||||
void flush_kernel_dcache_page(struct page *page)
|
||||
``void flush_kernel_dcache_page(struct page *page)``
|
||||
|
||||
When the kernel needs to modify a user page it has obtained
|
||||
with kmap, it calls this function after all modifications are
|
||||
complete (but before kunmapping it) to bring the underlying
|
||||
@ -366,14 +374,16 @@ maps this page at its virtual address.
|
||||
the kernel cache for page (using page_address(page)).
|
||||
|
||||
|
||||
void flush_icache_range(unsigned long start, unsigned long end)
|
||||
``void flush_icache_range(unsigned long start, unsigned long end)``
|
||||
|
||||
When the kernel stores into addresses that it will execute
|
||||
out of (eg when loading modules), this function is called.
|
||||
|
||||
If the icache does not snoop stores then this routine will need
|
||||
to flush it.
|
||||
|
||||
void flush_icache_page(struct vm_area_struct *vma, struct page *page)
|
||||
``void flush_icache_page(struct vm_area_struct *vma, struct page *page)``
|
||||
|
||||
All the functionality of flush_icache_page can be implemented in
|
||||
flush_dcache_page and update_mmu_cache. In the future, the hope
|
||||
is to remove this interface completely.
|
||||
@ -387,7 +397,8 @@ the kernel trying to do I/O to vmap areas must manually manage
|
||||
coherency. It must do this by flushing the vmap range before doing
|
||||
I/O and invalidating it after the I/O returns.
|
||||
|
||||
void flush_kernel_vmap_range(void *vaddr, int size)
|
||||
``void flush_kernel_vmap_range(void *vaddr, int size)``
|
||||
|
||||
flushes the kernel cache for a given virtual address range in
|
||||
the vmap area. This is to make sure that any data the kernel
|
||||
modified in the vmap range is made visible to the physical
|
||||
@ -395,7 +406,8 @@ I/O and invalidating it after the I/O returns.
|
||||
Note that this API does *not* also flush the offset map alias
|
||||
of the area.
|
||||
|
||||
void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates
|
||||
``void invalidate_kernel_vmap_range(void *vaddr, int size)``
|
||||
|
||||
invalidates the cache for a given virtual address range in the vmap area
|
||||
which prevents the processor from making the cache stale by
|
||||
speculatively reading data while the I/O was occurring to the
|
||||
|
@ -789,23 +789,46 @@ way to trigger. Applications should do whatever they can to help the
|
||||
system. It might be too late to consult with vmstat or any other
|
||||
statistics, so it's advisable to take an immediate action.
|
||||
|
||||
The events are propagated upward until the event is handled, i.e. the
|
||||
events are not pass-through. Here is what this means: for example you have
|
||||
three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
|
||||
and C, and suppose group C experiences some pressure. In this situation,
|
||||
only group C will receive the notification, i.e. groups A and B will not
|
||||
receive it. This is done to avoid excessive "broadcasting" of messages,
|
||||
which disturbs the system and which is especially bad if we are low on
|
||||
memory or thrashing. So, organize the cgroups wisely, or propagate the
|
||||
events manually (or, ask us to implement the pass-through events,
|
||||
explaining why would you need them.)
|
||||
By default, events are propagated upward until the event is handled, i.e. the
|
||||
events are not pass-through. For example, you have three cgroups: A->B->C. Now
|
||||
you set up an event listener on cgroups A, B and C, and suppose group C
|
||||
experiences some pressure. In this situation, only group C will receive the
|
||||
notification, i.e. groups A and B will not receive it. This is done to avoid
|
||||
excessive "broadcasting" of messages, which disturbs the system and which is
|
||||
especially bad if we are low on memory or thrashing. Group B will receive
|
||||
notification only if there are no event listeners for group C.
|
||||
|
||||
There are three optional modes that specify different propagation behavior:
|
||||
|
||||
- "default": this is the default behavior specified above. This mode is the
|
||||
same as omitting the optional mode parameter, preserved for backwards
|
||||
compatibility.
|
||||
|
||||
- "hierarchy": events always propagate up to the root, similar to the default
|
||||
behavior, except that propagation continues regardless of whether there are
|
||||
event listeners at each level, with the "hierarchy" mode. In the above
|
||||
example, groups A, B, and C will receive notification of memory pressure.
|
||||
|
||||
- "local": events are pass-through, i.e. they only receive notifications when
|
||||
memory pressure is experienced in the memcg for which the notification is
|
||||
registered. In the above example, group C will receive notification if
|
||||
registered for "local" notification and the group experiences memory
|
||||
pressure. However, group B will never receive notification, regardless if
|
||||
there is an event listener for group C or not, if group B is registered for
|
||||
local notification.
|
||||
|
||||
The level and event notification mode ("hierarchy" or "local", if necessary) are
|
||||
specified by a comma-delimited string, i.e. "low,hierarchy" specifies
|
||||
hierarchical, pass-through, notification for all ancestor memcgs. Notification
|
||||
that is the default, non pass-through behavior, does not specify a mode.
|
||||
"medium,local" specifies pass-through notification for the medium level.
|
||||
|
||||
The file memory.pressure_level is only used to setup an eventfd. To
|
||||
register a notification, an application must:
|
||||
|
||||
- create an eventfd using eventfd(2);
|
||||
- open memory.pressure_level;
|
||||
- write string like "<event_fd> <fd of memory.pressure_level> <level>"
|
||||
- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
|
||||
to cgroup.event_control.
|
||||
|
||||
Application will be notified through eventfd when memory pressure is at
|
||||
@ -821,7 +844,7 @@ Test:
|
||||
# cd /sys/fs/cgroup/memory/
|
||||
# mkdir foo
|
||||
# cd foo
|
||||
# cgroup_event_listener memory.pressure_level low &
|
||||
# cgroup_event_listener memory.pressure_level low,hierarchy &
|
||||
# echo 8000000 > memory.limit_in_bytes
|
||||
# echo 8000000 > memory.memsw.limit_in_bytes
|
||||
# echo $$ > tasks
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,9 +1,9 @@
|
||||
================
|
||||
CIRCULAR BUFFERS
|
||||
================
|
||||
================
|
||||
Circular Buffers
|
||||
================
|
||||
|
||||
By: David Howells <dhowells@redhat.com>
|
||||
Paul E. McKenney <paulmck@linux.vnet.ibm.com>
|
||||
:Author: David Howells <dhowells@redhat.com>
|
||||
:Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
|
||||
|
||||
|
||||
Linux provides a number of features that can be used to implement circular
|
||||
@ -20,7 +20,7 @@ producer and just one consumer. It is possible to handle multiple producers by
|
||||
serialising them, and to handle multiple consumers by serialising them.
|
||||
|
||||
|
||||
Contents:
|
||||
.. Contents:
|
||||
|
||||
(*) What is a circular buffer?
|
||||
|
||||
@ -31,8 +31,8 @@ Contents:
|
||||
- The consumer.
|
||||
|
||||
|
||||
==========================
|
||||
WHAT IS A CIRCULAR BUFFER?
|
||||
|
||||
What is a circular buffer?
|
||||
==========================
|
||||
|
||||
First of all, what is a circular buffer? A circular buffer is a buffer of
|
||||
@ -60,9 +60,7 @@ buffer, provided that neither index overtakes the other. The implementer must
|
||||
be careful, however, as a region more than one unit in size may wrap the end of
|
||||
the buffer and be broken into two segments.
|
||||
|
||||
|
||||
============================
|
||||
MEASURING POWER-OF-2 BUFFERS
|
||||
Measuring power-of-2 buffers
|
||||
============================
|
||||
|
||||
Calculation of the occupancy or the remaining capacity of an arbitrarily sized
|
||||
@ -71,13 +69,13 @@ modulus (divide) instruction. However, if the buffer is of a power-of-2 size,
|
||||
then a much quicker bitwise-AND instruction can be used instead.
|
||||
|
||||
Linux provides a set of macros for handling power-of-2 circular buffers. These
|
||||
can be made use of by:
|
||||
can be made use of by::
|
||||
|
||||
#include <linux/circ_buf.h>
|
||||
|
||||
The macros are:
|
||||
|
||||
(*) Measure the remaining capacity of a buffer:
|
||||
(#) Measure the remaining capacity of a buffer::
|
||||
|
||||
CIRC_SPACE(head_index, tail_index, buffer_size);
|
||||
|
||||
@ -85,7 +83,7 @@ The macros are:
|
||||
can be inserted.
|
||||
|
||||
|
||||
(*) Measure the maximum consecutive immediate space in a buffer:
|
||||
(#) Measure the maximum consecutive immediate space in a buffer::
|
||||
|
||||
CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
|
||||
|
||||
@ -94,14 +92,14 @@ The macros are:
|
||||
beginning of the buffer.
|
||||
|
||||
|
||||
(*) Measure the occupancy of a buffer:
|
||||
(#) Measure the occupancy of a buffer::
|
||||
|
||||
CIRC_CNT(head_index, tail_index, buffer_size);
|
||||
|
||||
This returns the number of items currently occupying a buffer[2].
|
||||
|
||||
|
||||
(*) Measure the non-wrapping occupancy of a buffer:
|
||||
(#) Measure the non-wrapping occupancy of a buffer::
|
||||
|
||||
CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
|
||||
|
||||
@ -112,7 +110,7 @@ The macros are:
|
||||
Each of these macros will nominally return a value between 0 and buffer_size-1,
|
||||
however:
|
||||
|
||||
[1] CIRC_SPACE*() are intended to be used in the producer. To the producer
|
||||
(1) CIRC_SPACE*() are intended to be used in the producer. To the producer
|
||||
they will return a lower bound as the producer controls the head index,
|
||||
but the consumer may still be depleting the buffer on another CPU and
|
||||
moving the tail index.
|
||||
@ -120,7 +118,7 @@ however:
|
||||
To the consumer it will show an upper bound as the producer may be busy
|
||||
depleting the space.
|
||||
|
||||
[2] CIRC_CNT*() are intended to be used in the consumer. To the consumer they
|
||||
(2) CIRC_CNT*() are intended to be used in the consumer. To the consumer they
|
||||
will return a lower bound as the consumer controls the tail index, but the
|
||||
producer may still be filling the buffer on another CPU and moving the
|
||||
head index.
|
||||
@ -128,14 +126,12 @@ however:
|
||||
To the producer it will show an upper bound as the consumer may be busy
|
||||
emptying the buffer.
|
||||
|
||||
[3] To a third party, the order in which the writes to the indices by the
|
||||
(3) To a third party, the order in which the writes to the indices by the
|
||||
producer and consumer become visible cannot be guaranteed as they are
|
||||
independent and may be made on different CPUs - so the result in such a
|
||||
situation will merely be a guess, and may even be negative.
|
||||
|
||||
|
||||
===========================================
|
||||
USING MEMORY BARRIERS WITH CIRCULAR BUFFERS
|
||||
Using memory barriers with circular buffers
|
||||
===========================================
|
||||
|
||||
By using memory barriers in conjunction with circular buffers, you can avoid
|
||||
@ -152,10 +148,10 @@ time, and only one thing should be emptying a buffer at any one time, but the
|
||||
two sides can operate simultaneously.
|
||||
|
||||
|
||||
THE PRODUCER
|
||||
The producer
|
||||
------------
|
||||
|
||||
The producer will look something like this:
|
||||
The producer will look something like this::
|
||||
|
||||
spin_lock(&producer_lock);
|
||||
|
||||
@ -193,10 +189,10 @@ ordering between the read of the index indicating that the consumer has
|
||||
vacated a given element and the write by the producer to that same element.
|
||||
|
||||
|
||||
THE CONSUMER
|
||||
The consumer
|
||||
------------
|
||||
|
||||
The consumer will look something like this:
|
||||
The consumer will look something like this::
|
||||
|
||||
spin_lock(&consumer_lock);
|
||||
|
||||
@ -235,8 +231,7 @@ prevents the compiler from tearing the store, and enforces ordering
|
||||
against previous accesses.
|
||||
|
||||
|
||||
===============
|
||||
FURTHER READING
|
||||
Further reading
|
||||
===============
|
||||
|
||||
See also Documentation/memory-barriers.txt for a description of Linux's memory
|
||||
|
@ -1,12 +1,16 @@
|
||||
The Common Clk Framework
|
||||
Mike Turquette <mturquette@ti.com>
|
||||
========================
|
||||
The Common Clk Framework
|
||||
========================
|
||||
|
||||
:Author: Mike Turquette <mturquette@ti.com>
|
||||
|
||||
This document endeavours to explain the common clk framework details,
|
||||
and how to port a platform over to this framework. It is not yet a
|
||||
detailed explanation of the clock api in include/linux/clk.h, but
|
||||
perhaps someday it will include that information.
|
||||
|
||||
Part 1 - introduction and interface split
|
||||
Introduction and interface split
|
||||
================================
|
||||
|
||||
The common clk framework is an interface to control the clock nodes
|
||||
available on various devices today. This may come in the form of clock
|
||||
@ -35,10 +39,11 @@ is defined in struct clk_foo and pointed to within struct clk_core. This
|
||||
allows for easy navigation between the two discrete halves of the common
|
||||
clock interface.
|
||||
|
||||
Part 2 - common data structures and api
|
||||
Common data structures and api
|
||||
==============================
|
||||
|
||||
Below is the common struct clk_core definition from
|
||||
drivers/clk/clk.c, modified for brevity:
|
||||
drivers/clk/clk.c, modified for brevity::
|
||||
|
||||
struct clk_core {
|
||||
const char *name;
|
||||
@ -59,7 +64,7 @@ struct clk. That api is documented in include/linux/clk.h.
|
||||
|
||||
Platforms and devices utilizing the common struct clk_core use the struct
|
||||
clk_ops pointer in struct clk_core to perform the hardware-specific parts of
|
||||
the operations defined in clk-provider.h:
|
||||
the operations defined in clk-provider.h::
|
||||
|
||||
struct clk_ops {
|
||||
int (*prepare)(struct clk_hw *hw);
|
||||
@ -95,19 +100,20 @@ the operations defined in clk-provider.h:
|
||||
struct dentry *dentry);
|
||||
};
|
||||
|
||||
Part 3 - hardware clk implementations
|
||||
Hardware clk implementations
|
||||
============================
|
||||
|
||||
The strength of the common struct clk_core comes from its .ops and .hw pointers
|
||||
which abstract the details of struct clk from the hardware-specific bits, and
|
||||
vice versa. To illustrate consider the simple gateable clk implementation in
|
||||
drivers/clk/clk-gate.c:
|
||||
drivers/clk/clk-gate.c::
|
||||
|
||||
struct clk_gate {
|
||||
struct clk_hw hw;
|
||||
void __iomem *reg;
|
||||
u8 bit_idx;
|
||||
...
|
||||
};
|
||||
struct clk_gate {
|
||||
struct clk_hw hw;
|
||||
void __iomem *reg;
|
||||
u8 bit_idx;
|
||||
...
|
||||
};
|
||||
|
||||
struct clk_gate contains struct clk_hw hw as well as hardware-specific
|
||||
knowledge about which register and bit controls this clk's gating.
|
||||
@ -115,7 +121,7 @@ Nothing about clock topology or accounting, such as enable_count or
|
||||
notifier_count, is needed here. That is all handled by the common
|
||||
framework code and struct clk_core.
|
||||
|
||||
Let's walk through enabling this clk from driver code:
|
||||
Let's walk through enabling this clk from driver code::
|
||||
|
||||
struct clk *clk;
|
||||
clk = clk_get(NULL, "my_gateable_clk");
|
||||
@ -123,70 +129,71 @@ Let's walk through enabling this clk from driver code:
|
||||
clk_prepare(clk);
|
||||
clk_enable(clk);
|
||||
|
||||
The call graph for clk_enable is very simple:
|
||||
The call graph for clk_enable is very simple::
|
||||
|
||||
clk_enable(clk);
|
||||
clk->ops->enable(clk->hw);
|
||||
[resolves to...]
|
||||
clk_gate_enable(hw);
|
||||
[resolves struct clk gate with to_clk_gate(hw)]
|
||||
clk_gate_set_bit(gate);
|
||||
clk_enable(clk);
|
||||
clk->ops->enable(clk->hw);
|
||||
[resolves to...]
|
||||
clk_gate_enable(hw);
|
||||
[resolves struct clk gate with to_clk_gate(hw)]
|
||||
clk_gate_set_bit(gate);
|
||||
|
||||
And the definition of clk_gate_set_bit:
|
||||
And the definition of clk_gate_set_bit::
|
||||
|
||||
static void clk_gate_set_bit(struct clk_gate *gate)
|
||||
{
|
||||
u32 reg;
|
||||
static void clk_gate_set_bit(struct clk_gate *gate)
|
||||
{
|
||||
u32 reg;
|
||||
|
||||
reg = __raw_readl(gate->reg);
|
||||
reg |= BIT(gate->bit_idx);
|
||||
writel(reg, gate->reg);
|
||||
}
|
||||
reg = __raw_readl(gate->reg);
|
||||
reg |= BIT(gate->bit_idx);
|
||||
writel(reg, gate->reg);
|
||||
}
|
||||
|
||||
Note that to_clk_gate is defined as:
|
||||
Note that to_clk_gate is defined as::
|
||||
|
||||
#define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw)
|
||||
#define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw)
|
||||
|
||||
This pattern of abstraction is used for every clock hardware
|
||||
representation.
|
||||
|
||||
Part 4 - supporting your own clk hardware
|
||||
Supporting your own clk hardware
|
||||
================================
|
||||
|
||||
When implementing support for a new type of clock it is only necessary to
|
||||
include the following header:
|
||||
include the following header::
|
||||
|
||||
#include <linux/clk-provider.h>
|
||||
#include <linux/clk-provider.h>
|
||||
|
||||
To construct a clk hardware structure for your platform you must define
|
||||
the following:
|
||||
the following::
|
||||
|
||||
struct clk_foo {
|
||||
struct clk_hw hw;
|
||||
... hardware specific data goes here ...
|
||||
};
|
||||
struct clk_foo {
|
||||
struct clk_hw hw;
|
||||
... hardware specific data goes here ...
|
||||
};
|
||||
|
||||
To take advantage of your data you'll need to support valid operations
|
||||
for your clk:
|
||||
for your clk::
|
||||
|
||||
struct clk_ops clk_foo_ops {
|
||||
.enable = &clk_foo_enable;
|
||||
.disable = &clk_foo_disable;
|
||||
};
|
||||
struct clk_ops clk_foo_ops {
|
||||
.enable = &clk_foo_enable;
|
||||
.disable = &clk_foo_disable;
|
||||
};
|
||||
|
||||
Implement the above functions using container_of:
|
||||
Implement the above functions using container_of::
|
||||
|
||||
#define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw)
|
||||
#define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw)
|
||||
|
||||
int clk_foo_enable(struct clk_hw *hw)
|
||||
{
|
||||
struct clk_foo *foo;
|
||||
int clk_foo_enable(struct clk_hw *hw)
|
||||
{
|
||||
struct clk_foo *foo;
|
||||
|
||||
foo = to_clk_foo(hw);
|
||||
foo = to_clk_foo(hw);
|
||||
|
||||
... perform magic on foo ...
|
||||
... perform magic on foo ...
|
||||
|
||||
return 0;
|
||||
};
|
||||
return 0;
|
||||
};
|
||||
|
||||
Below is a matrix detailing which clk_ops are mandatory based upon the
|
||||
hardware capabilities of that clock. A cell marked as "y" means
|
||||
@ -194,41 +201,56 @@ mandatory, a cell marked as "n" implies that either including that
|
||||
callback is invalid or otherwise unnecessary. Empty cells are either
|
||||
optional or must be evaluated on a case-by-case basis.
|
||||
|
||||
clock hardware characteristics
|
||||
-----------------------------------------------------------
|
||||
| gate | change rate | single parent | multiplexer | root |
|
||||
|------|-------------|---------------|-------------|------|
|
||||
.prepare | | | | | |
|
||||
.unprepare | | | | | |
|
||||
| | | | | |
|
||||
.enable | y | | | | |
|
||||
.disable | y | | | | |
|
||||
.is_enabled | y | | | | |
|
||||
| | | | | |
|
||||
.recalc_rate | | y | | | |
|
||||
.round_rate | | y [1] | | | |
|
||||
.determine_rate | | y [1] | | | |
|
||||
.set_rate | | y | | | |
|
||||
| | | | | |
|
||||
.set_parent | | | n | y | n |
|
||||
.get_parent | | | n | y | n |
|
||||
| | | | | |
|
||||
.recalc_accuracy| | | | | |
|
||||
| | | | | |
|
||||
.init | | | | | |
|
||||
-----------------------------------------------------------
|
||||
[1] either one of round_rate or determine_rate is required.
|
||||
.. table:: clock hardware characteristics
|
||||
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
| | gate | change rate | single parent | multiplexer | root |
|
||||
+================+======+=============+===============+=============+======+
|
||||
|.prepare | | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.unprepare | | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.enable | y | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.disable | y | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.is_enabled | y | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.recalc_rate | | y | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.round_rate | | y [1]_ | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.determine_rate | | y [1]_ | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.set_rate | | y | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.set_parent | | | n | y | n |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.get_parent | | | n | y | n |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.recalc_accuracy| | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.init | | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|
||||
.. [1] either one of round_rate or determine_rate is required.
|
||||
|
||||
Finally, register your clock at run-time with a hardware-specific
|
||||
registration function. This function simply populates struct clk_foo's
|
||||
data and then passes the common struct clk parameters to the framework
|
||||
with a call to:
|
||||
with a call to::
|
||||
|
||||
clk_register(...)
|
||||
clk_register(...)
|
||||
|
||||
See the basic clock types in drivers/clk/clk-*.c for examples.
|
||||
See the basic clock types in ``drivers/clk/clk-*.c`` for examples.
|
||||
|
||||
Part 5 - Disabling clock gating of unused clocks
|
||||
Disabling clock gating of unused clocks
|
||||
=======================================
|
||||
|
||||
Sometimes during development it can be useful to be able to bypass the
|
||||
default disabling of unused clocks. For example, if drivers aren't enabling
|
||||
@ -239,7 +261,8 @@ are sorted out.
|
||||
To bypass this disabling, include "clk_ignore_unused" in the bootargs to the
|
||||
kernel.
|
||||
|
||||
Part 6 - Locking
|
||||
Locking
|
||||
=======
|
||||
|
||||
The common clock framework uses two global locks, the prepare lock and the
|
||||
enable lock.
|
||||
|
@ -114,7 +114,7 @@ The Slab Cache
|
||||
User Space Memory Access
|
||||
------------------------
|
||||
|
||||
.. kernel-doc:: arch/x86/include/asm/uaccess_32.h
|
||||
.. kernel-doc:: arch/x86/include/asm/uaccess.h
|
||||
:internal:
|
||||
|
||||
.. kernel-doc:: arch/x86/lib/usercopy_32.c
|
||||
|
@ -1,9 +1,10 @@
|
||||
========
|
||||
CPU load
|
||||
--------
|
||||
========
|
||||
|
||||
Linux exports various bits of information via `/proc/stat' and
|
||||
`/proc/uptime' that userland tools, such as top(1), use to calculate
|
||||
the average time the system spent in a particular state, for example:
|
||||
Linux exports various bits of information via ``/proc/stat`` and
|
||||
``/proc/uptime`` that userland tools, such as top(1), use to calculate
|
||||
the average time the system spent in a particular state, for example::
|
||||
|
||||
$ iostat
|
||||
Linux 2.6.18.3-exp (linmac) 02/20/2007
|
||||
@ -17,7 +18,7 @@ Here the system thinks that over the default sampling period the
|
||||
system spent 10.01% of the time doing work in user space, 2.92% in the
|
||||
kernel, and was overall 81.63% of the time idle.
|
||||
|
||||
In most cases the `/proc/stat' information reflects the reality quite
|
||||
In most cases the ``/proc/stat`` information reflects the reality quite
|
||||
closely, however due to the nature of how/when the kernel collects
|
||||
this data, sometimes it cannot be trusted at all.
|
||||
|
||||
@ -33,78 +34,78 @@ Example
|
||||
-------
|
||||
|
||||
If we imagine the system with one task that periodically burns cycles
|
||||
in the following manner:
|
||||
in the following manner::
|
||||
|
||||
time line between two timer interrupts
|
||||
|--------------------------------------|
|
||||
^ ^
|
||||
|_ something begins working |
|
||||
|_ something goes to sleep
|
||||
(only to be awakened quite soon)
|
||||
time line between two timer interrupts
|
||||
|--------------------------------------|
|
||||
^ ^
|
||||
|_ something begins working |
|
||||
|_ something goes to sleep
|
||||
(only to be awakened quite soon)
|
||||
|
||||
In the above situation the system will be 0% loaded according to the
|
||||
`/proc/stat' (since the timer interrupt will always happen when the
|
||||
``/proc/stat`` (since the timer interrupt will always happen when the
|
||||
system is executing the idle handler), but in reality the load is
|
||||
closer to 99%.
|
||||
|
||||
One can imagine many more situations where this behavior of the kernel
|
||||
will lead to quite erratic information inside `/proc/stat'.
|
||||
will lead to quite erratic information inside ``/proc/stat``::
|
||||
|
||||
|
||||
/* gcc -o hog smallhog.c */
|
||||
#include <time.h>
|
||||
#include <limits.h>
|
||||
#include <signal.h>
|
||||
#include <sys/time.h>
|
||||
#define HIST 10
|
||||
/* gcc -o hog smallhog.c */
|
||||
#include <time.h>
|
||||
#include <limits.h>
|
||||
#include <signal.h>
|
||||
#include <sys/time.h>
|
||||
#define HIST 10
|
||||
|
||||
static volatile sig_atomic_t stop;
|
||||
static volatile sig_atomic_t stop;
|
||||
|
||||
static void sighandler (int signr)
|
||||
{
|
||||
(void) signr;
|
||||
stop = 1;
|
||||
}
|
||||
static unsigned long hog (unsigned long niters)
|
||||
{
|
||||
stop = 0;
|
||||
while (!stop && --niters);
|
||||
return niters;
|
||||
}
|
||||
int main (void)
|
||||
{
|
||||
int i;
|
||||
struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
|
||||
.it_value = { .tv_sec = 0, .tv_usec = 1 } };
|
||||
sigset_t set;
|
||||
unsigned long v[HIST];
|
||||
double tmp = 0.0;
|
||||
unsigned long n;
|
||||
signal (SIGALRM, &sighandler);
|
||||
setitimer (ITIMER_REAL, &it, NULL);
|
||||
static void sighandler (int signr)
|
||||
{
|
||||
(void) signr;
|
||||
stop = 1;
|
||||
}
|
||||
static unsigned long hog (unsigned long niters)
|
||||
{
|
||||
stop = 0;
|
||||
while (!stop && --niters);
|
||||
return niters;
|
||||
}
|
||||
int main (void)
|
||||
{
|
||||
int i;
|
||||
struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
|
||||
.it_value = { .tv_sec = 0, .tv_usec = 1 } };
|
||||
sigset_t set;
|
||||
unsigned long v[HIST];
|
||||
double tmp = 0.0;
|
||||
unsigned long n;
|
||||
signal (SIGALRM, &sighandler);
|
||||
setitimer (ITIMER_REAL, &it, NULL);
|
||||
|
||||
hog (ULONG_MAX);
|
||||
for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
|
||||
for (i = 0; i < HIST; ++i) tmp += v[i];
|
||||
tmp /= HIST;
|
||||
n = tmp - (tmp / 3.0);
|
||||
hog (ULONG_MAX);
|
||||
for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
|
||||
for (i = 0; i < HIST; ++i) tmp += v[i];
|
||||
tmp /= HIST;
|
||||
n = tmp - (tmp / 3.0);
|
||||
|
||||
sigemptyset (&set);
|
||||
sigaddset (&set, SIGALRM);
|
||||
sigemptyset (&set);
|
||||
sigaddset (&set, SIGALRM);
|
||||
|
||||
for (;;) {
|
||||
hog (n);
|
||||
sigwait (&set, &i);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
for (;;) {
|
||||
hog (n);
|
||||
sigwait (&set, &i);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
http://lkml.org/lkml/2007/2/12/6
|
||||
Documentation/filesystems/proc.txt (1.8)
|
||||
- http://lkml.org/lkml/2007/2/12/6
|
||||
- Documentation/filesystems/proc.txt (1.8)
|
||||
|
||||
|
||||
Thanks
|
||||
|
@ -1,3 +1,6 @@
|
||||
===========================================
|
||||
How CPU topology info is exported via sysfs
|
||||
===========================================
|
||||
|
||||
Export CPU topology info via sysfs. Items (attributes) are similar
|
||||
to /proc/cpuinfo output of some architectures:
|
||||
@ -75,24 +78,26 @@ CONFIG_SCHED_BOOK and CONFIG_DRAWER are currently only used on s390, where
|
||||
they reflect the cpu and cache hierarchy.
|
||||
|
||||
For an architecture to support this feature, it must define some of
|
||||
these macros in include/asm-XXX/topology.h:
|
||||
#define topology_physical_package_id(cpu)
|
||||
#define topology_core_id(cpu)
|
||||
#define topology_book_id(cpu)
|
||||
#define topology_drawer_id(cpu)
|
||||
#define topology_sibling_cpumask(cpu)
|
||||
#define topology_core_cpumask(cpu)
|
||||
#define topology_book_cpumask(cpu)
|
||||
#define topology_drawer_cpumask(cpu)
|
||||
these macros in include/asm-XXX/topology.h::
|
||||
|
||||
The type of **_id macros is int.
|
||||
The type of **_cpumask macros is (const) struct cpumask *. The latter
|
||||
correspond with appropriate **_siblings sysfs attributes (except for
|
||||
#define topology_physical_package_id(cpu)
|
||||
#define topology_core_id(cpu)
|
||||
#define topology_book_id(cpu)
|
||||
#define topology_drawer_id(cpu)
|
||||
#define topology_sibling_cpumask(cpu)
|
||||
#define topology_core_cpumask(cpu)
|
||||
#define topology_book_cpumask(cpu)
|
||||
#define topology_drawer_cpumask(cpu)
|
||||
|
||||
The type of ``**_id`` macros is int.
|
||||
The type of ``**_cpumask`` macros is ``(const) struct cpumask *``. The latter
|
||||
correspond with appropriate ``**_siblings`` sysfs attributes (except for
|
||||
topology_sibling_cpumask() which corresponds with thread_siblings).
|
||||
|
||||
To be consistent on all architectures, include/linux/topology.h
|
||||
provides default definitions for any of the above macros that are
|
||||
not defined by include/asm-XXX/topology.h:
|
||||
|
||||
1) physical_package_id: -1
|
||||
2) core_id: 0
|
||||
3) sibling_cpumask: just the given CPU
|
||||
@ -107,6 +112,7 @@ Additionally, CPU topology information is provided under
|
||||
/sys/devices/system/cpu and includes these files. The internal
|
||||
source for the output is in brackets ("[]").
|
||||
|
||||
=========== ==========================================================
|
||||
kernel_max: the maximum CPU index allowed by the kernel configuration.
|
||||
[NR_CPUS-1]
|
||||
|
||||
@ -122,6 +128,7 @@ source for the output is in brackets ("[]").
|
||||
|
||||
present: CPUs that have been identified as being present in the
|
||||
system. [cpu_present_mask]
|
||||
=========== ==========================================================
|
||||
|
||||
The format for the above output is compatible with cpulist_parse()
|
||||
[see <linux/cpumask.h>]. Some examples follow.
|
||||
@ -129,7 +136,7 @@ The format for the above output is compatible with cpulist_parse()
|
||||
In this example, there are 64 CPUs in the system but cpus 32-63 exceed
|
||||
the kernel max which is limited to 0..31 by the NR_CPUS config option
|
||||
being 32. Note also that CPUs 2 and 4-31 are not online but could be
|
||||
brought online as they are both present and possible.
|
||||
brought online as they are both present and possible::
|
||||
|
||||
kernel_max: 31
|
||||
offline: 2,4-31,32-63
|
||||
@ -140,7 +147,7 @@ brought online as they are both present and possible.
|
||||
In this example, the NR_CPUS config option is 128, but the kernel was
|
||||
started with possible_cpus=144. There are 4 CPUs in the system and cpu2
|
||||
was manually taken offline (and is the only CPU that can be brought
|
||||
online.)
|
||||
online.)::
|
||||
|
||||
kernel_max: 127
|
||||
offline: 2,4-127,128-143
|
||||
|
@ -1,4 +1,6 @@
|
||||
A brief CRC tutorial.
|
||||
=================================
|
||||
brief tutorial on CRC computation
|
||||
=================================
|
||||
|
||||
A CRC is a long-division remainder. You add the CRC to the message,
|
||||
and the whole thing (message+CRC) is a multiple of the given
|
||||
@ -8,7 +10,8 @@ remainder computed on the message+CRC is 0. This latter approach
|
||||
is used by a lot of hardware implementations, and is why so many
|
||||
protocols put the end-of-frame flag after the CRC.
|
||||
|
||||
It's actually the same long division you learned in school, except that
|
||||
It's actually the same long division you learned in school, except that:
|
||||
|
||||
- We're working in binary, so the digits are only 0 and 1, and
|
||||
- When dividing polynomials, there are no carries. Rather than add and
|
||||
subtract, we just xor. Thus, we tend to get a bit sloppy about
|
||||
@ -40,11 +43,12 @@ throw the quotient bit away, but subtract the appropriate multiple of
|
||||
the polynomial from the remainder and we're back to where we started,
|
||||
ready to process the next bit.
|
||||
|
||||
A big-endian CRC written this way would be coded like:
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
multiple = remainder & 0x80000000 ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1 | next_input_bit()) ^ multiple;
|
||||
}
|
||||
A big-endian CRC written this way would be coded like::
|
||||
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
multiple = remainder & 0x80000000 ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1 | next_input_bit()) ^ multiple;
|
||||
}
|
||||
|
||||
Notice how, to get at bit 32 of the shifted remainder, we look
|
||||
at bit 31 of the remainder *before* shifting it.
|
||||
@ -54,25 +58,26 @@ the remainder don't actually affect any decision-making until
|
||||
32 bits later. Thus, the first 32 cycles of this are pretty boring.
|
||||
Also, to add the CRC to a message, we need a 32-bit-long hole for it at
|
||||
the end, so we have to add 32 extra cycles shifting in zeros at the
|
||||
end of every message,
|
||||
end of every message.
|
||||
|
||||
These details lead to a standard trick: rearrange merging in the
|
||||
next_input_bit() until the moment it's needed. Then the first 32 cycles
|
||||
can be precomputed, and merging in the final 32 zero bits to make room
|
||||
for the CRC can be skipped entirely. This changes the code to:
|
||||
for the CRC can be skipped entirely. This changes the code to::
|
||||
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
remainder ^= next_input_bit() << 31;
|
||||
multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1) ^ multiple;
|
||||
}
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
remainder ^= next_input_bit() << 31;
|
||||
multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1) ^ multiple;
|
||||
}
|
||||
|
||||
With this optimization, the little-endian code is particularly simple:
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
remainder ^= next_input_bit();
|
||||
multiple = (remainder & 1) ? CRCPOLY : 0;
|
||||
remainder = (remainder >> 1) ^ multiple;
|
||||
}
|
||||
With this optimization, the little-endian code is particularly simple::
|
||||
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
remainder ^= next_input_bit();
|
||||
multiple = (remainder & 1) ? CRCPOLY : 0;
|
||||
remainder = (remainder >> 1) ^ multiple;
|
||||
}
|
||||
|
||||
The most significant coefficient of the remainder polynomial is stored
|
||||
in the least significant bit of the binary "remainder" variable.
|
||||
@ -81,23 +86,25 @@ be bit-reversed) and next_input_bit().
|
||||
|
||||
As long as next_input_bit is returning the bits in a sensible order, we don't
|
||||
*have* to wait until the last possible moment to merge in additional bits.
|
||||
We can do it 8 bits at a time rather than 1 bit at a time:
|
||||
for (i = 0; i < input_bytes; i++) {
|
||||
remainder ^= next_input_byte() << 24;
|
||||
for (j = 0; j < 8; j++) {
|
||||
multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1) ^ multiple;
|
||||
}
|
||||
}
|
||||
We can do it 8 bits at a time rather than 1 bit at a time::
|
||||
|
||||
Or in little-endian:
|
||||
for (i = 0; i < input_bytes; i++) {
|
||||
remainder ^= next_input_byte();
|
||||
for (j = 0; j < 8; j++) {
|
||||
multiple = (remainder & 1) ? CRCPOLY : 0;
|
||||
remainder = (remainder >> 1) ^ multiple;
|
||||
for (i = 0; i < input_bytes; i++) {
|
||||
remainder ^= next_input_byte() << 24;
|
||||
for (j = 0; j < 8; j++) {
|
||||
multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1) ^ multiple;
|
||||
}
|
||||
}
|
||||
|
||||
Or in little-endian::
|
||||
|
||||
for (i = 0; i < input_bytes; i++) {
|
||||
remainder ^= next_input_byte();
|
||||
for (j = 0; j < 8; j++) {
|
||||
multiple = (remainder & 1) ? CRCPOLY : 0;
|
||||
remainder = (remainder >> 1) ^ multiple;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
If the input is a multiple of 32 bits, you can even XOR in a 32-bit
|
||||
word at a time and increase the inner loop count to 32.
|
||||
|
@ -10,6 +10,7 @@ Contents:
|
||||
- Signature verification.
|
||||
- Asymmetric key subtypes.
|
||||
- Instantiation data parsers.
|
||||
- Keyring link restrictions.
|
||||
|
||||
|
||||
========
|
||||
@ -318,7 +319,8 @@ KEYRING LINK RESTRICTIONS
|
||||
=========================
|
||||
|
||||
Keyrings created from userspace using add_key can be configured to check the
|
||||
signature of the key being linked.
|
||||
signature of the key being linked. Keys without a valid signature are not
|
||||
allowed to link.
|
||||
|
||||
Several restriction methods are available:
|
||||
|
||||
@ -327,9 +329,10 @@ Several restriction methods are available:
|
||||
- Option string used with KEYCTL_RESTRICT_KEYRING:
|
||||
- "builtin_trusted"
|
||||
|
||||
The kernel builtin trusted keyring will be searched for the signing
|
||||
key. The ca_keys kernel parameter also affects which keys are used for
|
||||
signature verification.
|
||||
The kernel builtin trusted keyring will be searched for the signing key.
|
||||
If the builtin trusted keyring is not configured, all links will be
|
||||
rejected. The ca_keys kernel parameter also affects which keys are used
|
||||
for signature verification.
|
||||
|
||||
(2) Restrict using the kernel builtin and secondary trusted keyrings
|
||||
|
||||
@ -337,8 +340,10 @@ Several restriction methods are available:
|
||||
- "builtin_and_secondary_trusted"
|
||||
|
||||
The kernel builtin and secondary trusted keyrings will be searched for the
|
||||
signing key. The ca_keys kernel parameter also affects which keys are used
|
||||
for signature verification.
|
||||
signing key. If the secondary trusted keyring is not configured, this
|
||||
restriction will behave like the "builtin_trusted" option. The ca_keys
|
||||
kernel parameter also affects which keys are used for signature
|
||||
verification.
|
||||
|
||||
(3) Restrict using a separate key or keyring
|
||||
|
||||
@ -346,7 +351,7 @@ Several restriction methods are available:
|
||||
- "key_or_keyring:<key or keyring serial number>[:chain]"
|
||||
|
||||
Whenever a key link is requested, the link will only succeed if the key
|
||||
being linked is signed by one of the designated keys. This key may be
|
||||
being linked is signed by one of the designated keys. This key may be
|
||||
specified directly by providing a serial number for one asymmetric key, or
|
||||
a group of keys may be searched for the signing key by providing the
|
||||
serial number for a keyring.
|
||||
@ -354,7 +359,51 @@ Several restriction methods are available:
|
||||
When the "chain" option is provided at the end of the string, the keys
|
||||
within the destination keyring will also be searched for signing keys.
|
||||
This allows for verification of certificate chains by adding each
|
||||
cert in order (starting closest to the root) to one keyring.
|
||||
certificate in order (starting closest to the root) to a keyring. For
|
||||
instance, one keyring can be populated with links to a set of root
|
||||
certificates, with a separate, restricted keyring set up for each
|
||||
certificate chain to be validated:
|
||||
|
||||
# Create and populate a keyring for root certificates
|
||||
root_id=`keyctl add keyring root-certs "" @s`
|
||||
keyctl padd asymmetric "" $root_id < root1.cert
|
||||
keyctl padd asymmetric "" $root_id < root2.cert
|
||||
|
||||
# Create and restrict a keyring for the certificate chain
|
||||
chain_id=`keyctl add keyring chain "" @s`
|
||||
keyctl restrict_keyring $chain_id asymmetric key_or_keyring:$root_id:chain
|
||||
|
||||
# Attempt to add each certificate in the chain, starting with the
|
||||
# certificate closest to the root.
|
||||
keyctl padd asymmetric "" $chain_id < intermediateA.cert
|
||||
keyctl padd asymmetric "" $chain_id < intermediateB.cert
|
||||
keyctl padd asymmetric "" $chain_id < end-entity.cert
|
||||
|
||||
If the final end-entity certificate is successfully added to the "chain"
|
||||
keyring, we can be certain that it has a valid signing chain going back to
|
||||
one of the root certificates.
|
||||
|
||||
A single keyring can be used to verify a chain of signatures by
|
||||
restricting the keyring after linking the root certificate:
|
||||
|
||||
# Create a keyring for the certificate chain and add the root
|
||||
chain2_id=`keyctl add keyring chain2 "" @s`
|
||||
keyctl padd asymmetric "" $chain2_id < root1.cert
|
||||
|
||||
# Restrict the keyring that already has root1.cert linked. The cert
|
||||
# will remain linked by the keyring.
|
||||
keyctl restrict_keyring $chain2_id asymmetric key_or_keyring:0:chain
|
||||
|
||||
# Attempt to add each certificate in the chain, starting with the
|
||||
# certificate closest to the root.
|
||||
keyctl padd asymmetric "" $chain2_id < intermediateA.cert
|
||||
keyctl padd asymmetric "" $chain2_id < intermediateB.cert
|
||||
keyctl padd asymmetric "" $chain2_id < end-entity.cert
|
||||
|
||||
If the final end-entity certificate is successfully added to the "chain2"
|
||||
keyring, we can be certain that there is a valid signing chain going back
|
||||
to the root certificate that was added before the keyring was restricted.
|
||||
|
||||
|
||||
In all of these cases, if the signing key is found the signature of the key to
|
||||
be linked will be verified using the signing key. The requested key is added
|
||||
|
@ -1,4 +1,9 @@
|
||||
===================================
|
||||
Dell Systems Management Base Driver
|
||||
===================================
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
The Dell Systems Management Base Driver provides a sysfs interface for
|
||||
systems management software such as Dell OpenManage to perform system
|
||||
@ -17,6 +22,7 @@ more information about the libsmbios project.
|
||||
|
||||
|
||||
System Management Interrupt
|
||||
===========================
|
||||
|
||||
On some Dell systems, systems management software must access certain
|
||||
management information via a system management interrupt (SMI). The SMI data
|
||||
@ -24,12 +30,12 @@ buffer must reside in 32-bit address space, and the physical address of the
|
||||
buffer is required for the SMI. The driver maintains the memory required for
|
||||
the SMI and provides a way for the application to generate the SMI.
|
||||
The driver creates the following sysfs entries for systems management
|
||||
software to perform these system management interrupts:
|
||||
software to perform these system management interrupts::
|
||||
|
||||
/sys/devices/platform/dcdbas/smi_data
|
||||
/sys/devices/platform/dcdbas/smi_data_buf_phys_addr
|
||||
/sys/devices/platform/dcdbas/smi_data_buf_size
|
||||
/sys/devices/platform/dcdbas/smi_request
|
||||
/sys/devices/platform/dcdbas/smi_data
|
||||
/sys/devices/platform/dcdbas/smi_data_buf_phys_addr
|
||||
/sys/devices/platform/dcdbas/smi_data_buf_size
|
||||
/sys/devices/platform/dcdbas/smi_request
|
||||
|
||||
Systems management software must perform the following steps to execute
|
||||
a SMI using this driver:
|
||||
@ -43,6 +49,7 @@ a SMI using this driver:
|
||||
|
||||
|
||||
Host Control Action
|
||||
===================
|
||||
|
||||
Dell OpenManage supports a host control feature that allows the administrator
|
||||
to perform a power cycle or power off of the system after the OS has finished
|
||||
@ -69,12 +76,14 @@ power off host control action using this driver:
|
||||
|
||||
|
||||
Host Control SMI Type
|
||||
=====================
|
||||
|
||||
The following table shows the value to write to host_control_smi_type to
|
||||
perform a power cycle or power off host control action:
|
||||
|
||||
=================== =====================
|
||||
PowerEdge System Host Control SMI Type
|
||||
---------------- ---------------------
|
||||
=================== =====================
|
||||
300 HC_SMITYPE_TYPE1
|
||||
1300 HC_SMITYPE_TYPE1
|
||||
1400 HC_SMITYPE_TYPE2
|
||||
@ -87,5 +96,4 @@ PowerEdge System Host Control SMI Type
|
||||
1655MC HC_SMITYPE_TYPE2
|
||||
700 HC_SMITYPE_TYPE3
|
||||
750 HC_SMITYPE_TYPE3
|
||||
|
||||
|
||||
=================== =====================
|
||||
|
@ -1,6 +1,6 @@
|
||||
|
||||
Using physical DMA provided by OHCI-1394 FireWire controllers for debugging
|
||||
---------------------------------------------------------------------------
|
||||
===========================================================================
|
||||
Using physical DMA provided by OHCI-1394 FireWire controllers for debugging
|
||||
===========================================================================
|
||||
|
||||
Introduction
|
||||
------------
|
||||
@ -91,10 +91,10 @@ Step-by-step instructions for using firescope with early OHCI initialization:
|
||||
1) Verify that your hardware is supported:
|
||||
|
||||
Load the firewire-ohci module and check your kernel logs.
|
||||
You should see a line similar to
|
||||
You should see a line similar to::
|
||||
|
||||
firewire_ohci 0000:15:00.1: added OHCI v1.0 device as card 2, 4 IR + 4 IT
|
||||
... contexts, quirks 0x11
|
||||
firewire_ohci 0000:15:00.1: added OHCI v1.0 device as card 2, 4 IR + 4 IT
|
||||
... contexts, quirks 0x11
|
||||
|
||||
when loading the driver. If you have no supported controller, many PCI,
|
||||
CardBus and even some Express cards which are fully compliant to OHCI-1394
|
||||
@ -113,9 +113,9 @@ Step-by-step instructions for using firescope with early OHCI initialization:
|
||||
stable connection and has matching connectors (there are small 4-pin and
|
||||
large 6-pin FireWire ports) will do.
|
||||
|
||||
If an driver is running on both machines you should see a line like
|
||||
If a driver is running on both machines you should see a line like::
|
||||
|
||||
firewire_core 0000:15:00.1: created device fw1: GUID 00061b0020105917, S400
|
||||
firewire_core 0000:15:00.1: created device fw1: GUID 00061b0020105917, S400
|
||||
|
||||
on both machines in the kernel log when the cable is plugged in
|
||||
and connects the two machines.
|
||||
@ -123,7 +123,7 @@ Step-by-step instructions for using firescope with early OHCI initialization:
|
||||
3) Test physical DMA using firescope:
|
||||
|
||||
On the debug host, make sure that /dev/fw* is accessible,
|
||||
then start firescope:
|
||||
then start firescope::
|
||||
|
||||
$ firescope
|
||||
Port 0 (/dev/fw1) opened, 2 nodes detected
|
||||
@ -163,7 +163,7 @@ Step-by-step instructions for using firescope with early OHCI initialization:
|
||||
host loaded, reboot the debugged machine, booting the kernel which has
|
||||
CONFIG_PROVIDE_OHCI1394_DMA_INIT enabled, with the option ohci1394_dma=early.
|
||||
|
||||
Then, on the debugging host, run firescope, for example by using -A:
|
||||
Then, on the debugging host, run firescope, for example by using -A::
|
||||
|
||||
firescope -A System.map-of-debug-target-kernel
|
||||
|
||||
@ -178,6 +178,7 @@ Step-by-step instructions for using firescope with early OHCI initialization:
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
Documentation and specifications: http://halobates.de/firewire/
|
||||
|
||||
FireWire is a trademark of Apple Inc. - for more information please refer to:
|
||||
|
@ -1,18 +1,30 @@
|
||||
Purpose:
|
||||
Demonstrate the usage of the new open sourced rbu (Remote BIOS Update) driver
|
||||
=============================================================
|
||||
Usage of the new open sourced rbu (Remote BIOS Update) driver
|
||||
=============================================================
|
||||
|
||||
Purpose
|
||||
=======
|
||||
|
||||
Document demonstrating the use of the Dell Remote BIOS Update driver
|
||||
for updating BIOS images on Dell servers and desktops.
|
||||
|
||||
Scope:
|
||||
Scope
|
||||
=====
|
||||
|
||||
This document discusses the functionality of the rbu driver only.
|
||||
It does not cover the support needed from applications to enable the BIOS to
|
||||
update itself with the image downloaded in to the memory.
|
||||
|
||||
Overview:
|
||||
Overview
|
||||
========
|
||||
|
||||
This driver works with Dell OpenManage or Dell Update Packages for updating
|
||||
the BIOS on Dell servers (starting from servers sold since 1999), desktops
|
||||
and notebooks (starting from those sold in 2005).
|
||||
|
||||
Please go to http://support.dell.com, register, and you can find info on
|
||||
OpenManage and Dell Update packages (DUP).
|
||||
|
||||
Libsmbios can also be used to update BIOS on Dell systems; go to
|
||||
http://linux.dell.com/libsmbios/ for details.
|
||||
|
||||
@ -22,6 +34,7 @@ of physical pages having the BIOS image. In case of packetized the app
|
||||
using the driver breaks the image in to packets of fixed sizes and the driver
|
||||
would place each packet in contiguous physical memory. The driver also
|
||||
maintains a linked list of packets for reading them back.
|
||||
|
||||
If the dell_rbu driver is unloaded all the allocated memory is freed.
|
||||
|
||||
The rbu driver needs to have an application (as mentioned above) which will
|
||||
@ -30,28 +43,33 @@ inform the BIOS to enable the update in the next system reboot.
|
||||
The user should not unload the rbu driver after downloading the BIOS image
|
||||
or updating.
|
||||
|
||||
The driver load creates the following directories under the /sys file system.
|
||||
/sys/class/firmware/dell_rbu/loading
|
||||
/sys/class/firmware/dell_rbu/data
|
||||
/sys/devices/platform/dell_rbu/image_type
|
||||
/sys/devices/platform/dell_rbu/data
|
||||
/sys/devices/platform/dell_rbu/packet_size
|
||||
The driver load creates the following directories under the /sys file system::
|
||||
|
||||
/sys/class/firmware/dell_rbu/loading
|
||||
/sys/class/firmware/dell_rbu/data
|
||||
/sys/devices/platform/dell_rbu/image_type
|
||||
/sys/devices/platform/dell_rbu/data
|
||||
/sys/devices/platform/dell_rbu/packet_size
|
||||
|
||||
The driver supports two types of update mechanism; monolithic and packetized.
|
||||
These update mechanisms depend upon the BIOS currently running on the system.
|
||||
Most of the Dell systems support a monolithic update where the BIOS image is
|
||||
copied to a single contiguous block of physical memory.
|
||||
|
||||
In case of packet mechanism the single memory can be broken in smaller chunks
|
||||
of contiguous memory and the BIOS image is scattered in these packets.
|
||||
|
||||
By default the driver uses monolithic memory for the update type. This can be
|
||||
changed to packets during the driver load time by specifying the load
|
||||
parameter image_type=packet. This can also be changed later as below
|
||||
echo packet > /sys/devices/platform/dell_rbu/image_type
|
||||
parameter image_type=packet. This can also be changed later as below::
|
||||
|
||||
echo packet > /sys/devices/platform/dell_rbu/image_type
|
||||
|
||||
In packet update mode the packet size has to be given before any packets can
|
||||
be downloaded. It is done as below
|
||||
echo XXXX > /sys/devices/platform/dell_rbu/packet_size
|
||||
be downloaded. It is done as below::
|
||||
|
||||
echo XXXX > /sys/devices/platform/dell_rbu/packet_size
|
||||
|
||||
In the packet update mechanism, the user needs to create a new file having
|
||||
packets of data arranged back to back. It can be done as follows
|
||||
The user creates packets header, gets the chunk of the BIOS image and
|
||||
@ -60,41 +78,54 @@ added together should match the specified packet_size. This makes one
|
||||
packet, the user needs to create more such packets out of the entire BIOS
|
||||
image file and then arrange all these packets back to back in to one single
|
||||
file.
|
||||
|
||||
This file is then copied to /sys/class/firmware/dell_rbu/data.
|
||||
Once this file gets to the driver, the driver extracts packet_size data from
|
||||
the file and spreads it across the physical memory in contiguous packet_sized
|
||||
space.
|
||||
|
||||
This method makes sure that all the packets get to the driver in a single operation.
|
||||
|
||||
In monolithic update the user simply gets the BIOS image (.hdr file) and copies it
|
||||
to the data file as is without any change to the BIOS image itself.
|
||||
|
||||
Do the steps below to download the BIOS image.
|
||||
|
||||
1) echo 1 > /sys/class/firmware/dell_rbu/loading
|
||||
2) cp bios_image.hdr /sys/class/firmware/dell_rbu/data
|
||||
3) echo 0 > /sys/class/firmware/dell_rbu/loading
|
||||
|
||||
The /sys/class/firmware/dell_rbu/ entries will remain till the following is
|
||||
done.
|
||||
echo -1 > /sys/class/firmware/dell_rbu/loading
|
||||
|
||||
::
|
||||
|
||||
echo -1 > /sys/class/firmware/dell_rbu/loading
|
||||
|
||||
Until this step is completed the driver cannot be unloaded.
|
||||
|
||||
Also echoing either mono, packet or init in to image_type will free up the
|
||||
memory allocated by the driver.
|
||||
|
||||
If a user by accident executes steps 1 and 3 above without executing step 2;
|
||||
it will make the /sys/class/firmware/dell_rbu/ entries disappear.
|
||||
The entries can be recreated by doing the following
|
||||
echo init > /sys/devices/platform/dell_rbu/image_type
|
||||
NOTE: echoing init in image_type does not change it original value.
|
||||
|
||||
The entries can be recreated by doing the following::
|
||||
|
||||
echo init > /sys/devices/platform/dell_rbu/image_type
|
||||
|
||||
.. note:: echoing init in image_type does not change its original value.
|
||||
|
||||
Also the driver provides /sys/devices/platform/dell_rbu/data readonly file to
|
||||
read back the image downloaded.
|
||||
|
||||
NOTE:
|
||||
This driver requires a patch for firmware_class.c which has the modified
|
||||
request_firmware_nowait function.
|
||||
Also after updating the BIOS image a user mode application needs to execute
|
||||
code which sends the BIOS update request to the BIOS. So on the next reboot
|
||||
the BIOS knows about the new image downloaded and it updates itself.
|
||||
Also don't unload the rbu driver if the image has to be updated.
|
||||
.. note::
|
||||
|
||||
This driver requires a patch for firmware_class.c which has the modified
|
||||
request_firmware_nowait function.
|
||||
|
||||
Also after updating the BIOS image a user mode application needs to execute
|
||||
code which sends the BIOS update request to the BIOS. So on the next reboot
|
||||
the BIOS knows about the new image downloaded and it updates itself.
|
||||
Also don't unload the rbu driver if the image has to be updated.
|
||||
|
||||
|
31
Documentation/devicetree/bindings/clock/img,boston-clock.txt
Normal file
31
Documentation/devicetree/bindings/clock/img,boston-clock.txt
Normal file
@ -0,0 +1,31 @@
|
||||
Binding for Imagination Technologies MIPS Boston clock sources.
|
||||
|
||||
This binding uses the common clock binding[1].
|
||||
|
||||
[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
|
||||
|
||||
The device node must be a child node of the syscon node corresponding to the
|
||||
Boston system's platform registers.
|
||||
|
||||
Required properties:
|
||||
- compatible : Should be "img,boston-clock".
|
||||
- #clock-cells : Should be set to 1.
|
||||
Values available for clock consumers can be found in the header file:
|
||||
<dt-bindings/clock/boston-clock.h>
|
||||
|
||||
Example:
|
||||
|
||||
system-controller@17ffd000 {
|
||||
compatible = "img,boston-platform-regs", "syscon";
|
||||
reg = <0x17ffd000 0x1000>;
|
||||
|
||||
clk_boston: clock {
|
||||
compatible = "img,boston-clock";
|
||||
#clock-cells = <1>;
|
||||
};
|
||||
};
|
||||
|
||||
uart0: uart@17ffe000 {
|
||||
/* ... */
|
||||
clocks = <&clk_boston BOSTON_CLK_SYS>;
|
||||
};
|
48
Documentation/devicetree/bindings/i2c/i2c-aspeed.txt
Normal file
48
Documentation/devicetree/bindings/i2c/i2c-aspeed.txt
Normal file
@ -0,0 +1,48 @@
|
||||
Device tree configuration for the I2C busses on the AST24XX and AST25XX SoCs.
|
||||
|
||||
Required Properties:
|
||||
- #address-cells : should be 1
|
||||
- #size-cells : should be 0
|
||||
- reg : address offset and range of bus
|
||||
- compatible : should be "aspeed,ast2400-i2c-bus"
|
||||
or "aspeed,ast2500-i2c-bus"
|
||||
- clocks : root clock of bus, should reference the APB
|
||||
clock
|
||||
- interrupts : interrupt number
|
||||
- interrupt-parent : interrupt controller for bus, should reference a
|
||||
aspeed,ast2400-i2c-ic or aspeed,ast2500-i2c-ic
|
||||
interrupt controller
|
||||
|
||||
Optional Properties:
|
||||
- bus-frequency : frequency of the bus clock in Hz; defaults to 100 kHz when not
|
||||
specified
|
||||
- multi-master : states that there is another master active on this bus.
|
||||
|
||||
Example:
|
||||
|
||||
i2c {
|
||||
compatible = "simple-bus";
|
||||
#address-cells = <1>;
|
||||
#size-cells = <1>;
|
||||
ranges = <0 0x1e78a000 0x1000>;
|
||||
|
||||
i2c_ic: interrupt-controller@0 {
|
||||
#interrupt-cells = <1>;
|
||||
compatible = "aspeed,ast2400-i2c-ic";
|
||||
reg = <0x0 0x40>;
|
||||
interrupts = <12>;
|
||||
interrupt-controller;
|
||||
};
|
||||
|
||||
i2c0: i2c-bus@40 {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
#interrupt-cells = <1>;
|
||||
reg = <0x40 0x40>;
|
||||
compatible = "aspeed,ast2400-i2c-bus";
|
||||
clocks = <&clk_apb>;
|
||||
bus-frequency = <100000>;
|
||||
interrupts = <0>;
|
||||
interrupt-parent = <&i2c_ic>;
|
||||
};
|
||||
};
|
@ -20,7 +20,7 @@ Optional properties :
|
||||
- i2c-sda-falling-time-ns : should contain the SDA falling time in nanoseconds.
|
||||
This value which is by default 300ns is used to compute the tHIGH period.
|
||||
|
||||
Example :
|
||||
Examples :
|
||||
|
||||
i2c@f0000 {
|
||||
#address-cells = <1>;
|
||||
@ -43,3 +43,17 @@ Example :
|
||||
i2c-sda-falling-time-ns = <300>;
|
||||
i2c-scl-falling-time-ns = <300>;
|
||||
};
|
||||
|
||||
i2c@1120000 {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
reg = <0x2000 0x100>;
|
||||
clock-frequency = <400000>;
|
||||
clocks = <&i2cclk>;
|
||||
interrupts = <0>;
|
||||
|
||||
eeprom@64 {
|
||||
compatible = "linux,slave-24c02";
|
||||
reg = <0x40000064>;
|
||||
};
|
||||
};
|
||||
|
29
Documentation/devicetree/bindings/i2c/i2c-pca-platform.txt
Normal file
29
Documentation/devicetree/bindings/i2c/i2c-pca-platform.txt
Normal file
@ -0,0 +1,29 @@
|
||||
* NXP PCA PCA9564/PCA9665 I2C controller
|
||||
|
||||
The PCA9564/PCA9665 serves as an interface between most standard
|
||||
parallel-bus microcontrollers/microprocessors and the serial I2C-bus
|
||||
and allows the parallel bus system to communicate bi-directionally
|
||||
with the I2C-bus.
|
||||
|
||||
Required properties :
|
||||
|
||||
- reg : Offset and length of the register set for the device
|
||||
- compatible : one of "nxp,pca9564" or "nxp,pca9665"
|
||||
|
||||
Optional properties :
|
||||
- interrupts : the interrupt number
|
||||
- interrupt-parent : the phandle for the interrupt controller.
|
||||
If an interrupt is not specified polling will be used.
|
||||
- reset-gpios : gpio specifier for gpio connected to RESET_N pin. As the line
|
||||
is active low, it should be marked GPIO_ACTIVE_LOW.
|
||||
- clock-frequency : I2C bus frequency.
|
||||
|
||||
Example:
|
||||
i2c0: i2c@80000 {
|
||||
compatible = "nxp,pca9564";
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
reg = <0x80000 0x4>;
|
||||
reset-gpios = <&gpio1 0 GPIO_ACTIVE_LOW>;
|
||||
clock-frequency = <100000>;
|
||||
};
|
22
Documentation/devicetree/bindings/i2c/i2c-zx2967.txt
Normal file
22
Documentation/devicetree/bindings/i2c/i2c-zx2967.txt
Normal file
@ -0,0 +1,22 @@
|
||||
ZTE zx2967 I2C controller
|
||||
|
||||
Required properties:
|
||||
- compatible: must be "zte,zx296718-i2c"
|
||||
- reg: physical address and length of the device registers
|
||||
- interrupts: a single interrupt specifier
|
||||
- clocks: clock for the device
|
||||
- #address-cells: should be <1>
|
||||
- #size-cells: should be <0>
|
||||
- clock-frequency: the desired I2C bus clock frequency.
|
||||
|
||||
Examples:
|
||||
|
||||
i2c@112000 {
|
||||
compatible = "zte,zx296718-i2c";
|
||||
reg = <0x00112000 0x1000>;
|
||||
interrupts = <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>;
|
||||
clocks = <&osc24m>;
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
clock-frequency = <1600000>;
|
||||
};
|
@ -26,6 +26,12 @@ the PCIe specification.
|
||||
* "priq" - PRI Queue not empty
|
||||
* "cmdq-sync" - CMD_SYNC complete
|
||||
* "gerror" - Global Error activated
|
||||
* "combined" - The combined interrupt is optional,
|
||||
and should only be provided if the
|
||||
hardware supports just a single,
|
||||
combined interrupt line.
|
||||
If provided, then the combined interrupt
|
||||
will be used in preference to any others.
|
||||
|
||||
- #iommu-cells : See the generic IOMMU binding described in
|
||||
devicetree/bindings/pci/pci-iommu.txt
|
||||
@ -49,6 +55,12 @@ the PCIe specification.
|
||||
- hisilicon,broken-prefetch-cmd
|
||||
: Avoid sending CMD_PREFETCH_* commands to the SMMU.
|
||||
|
||||
- cavium,cn9900-broken-page1-regspace
|
||||
: Replaces all page 1 offsets used for EVTQ_PROD/CONS,
|
||||
PRIQ_PROD/CONS register access with page 0 offsets.
|
||||
Set for Cavium ThunderX2 silicon that doesn't support
|
||||
SMMU page1 register space.
|
||||
|
||||
** Example
|
||||
|
||||
smmu@2b400000 {
|
||||
|
@ -3,10 +3,23 @@
|
||||
Required properties:
|
||||
- compatible : should be one of the following:
|
||||
"altr,socfpga-denali-nand" - for Altera SOCFPGA
|
||||
"socionext,uniphier-denali-nand-v5a" - for Socionext UniPhier (v5a)
|
||||
"socionext,uniphier-denali-nand-v5b" - for Socionext UniPhier (v5b)
|
||||
- reg : should contain registers location and length for data and reg.
|
||||
- reg-names: Should contain the reg names "nand_data" and "denali_reg"
|
||||
- interrupts : The interrupt number.
|
||||
|
||||
Optional properties:
|
||||
- nand-ecc-step-size: see nand.txt for details. If present, the value must be
|
||||
512 for "altr,socfpga-denali-nand"
|
||||
1024 for "socionext,uniphier-denali-nand-v5a"
|
||||
1024 for "socionext,uniphier-denali-nand-v5b"
|
||||
- nand-ecc-strength: see nand.txt for details. Valid values are:
|
||||
8, 15 for "altr,socfpga-denali-nand"
|
||||
8, 16, 24 for "socionext,uniphier-denali-nand-v5a"
|
||||
8, 16 for "socionext,uniphier-denali-nand-v5b"
|
||||
- nand-ecc-maximize: see nand.txt for details
|
||||
|
||||
The device tree may optionally contain sub-nodes describing partitions of the
|
||||
address space. See partition.txt for more detail.
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
Error location module
|
||||
|
||||
Required properties:
|
||||
- compatible: Must be "ti,am33xx-elm"
|
||||
- compatible: Must be "ti,am3352-elm"
|
||||
- reg: physical base address and size of the registers map.
|
||||
- interrupts: Interrupt number for the elm.
|
||||
|
||||
|
@ -5,7 +5,7 @@ the GPMC controller with a name of "nand".
|
||||
|
||||
All timing relevant properties as well as generic gpmc child properties are
|
||||
explained in separate documents - please refer to
|
||||
Documentation/devicetree/bindings/bus/ti-gpmc.txt
|
||||
Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
|
||||
|
||||
For NAND specific properties such as ECC modes or bus width, please refer to
|
||||
Documentation/devicetree/bindings/mtd/nand.txt
|
||||
|
@ -5,7 +5,7 @@ child nodes of the GPMC controller with a name of "nor".
|
||||
|
||||
All timing relevant properties as well as generic GPMC child properties are
|
||||
explained in separate documents. Please refer to
|
||||
Documentation/devicetree/bindings/bus/ti-gpmc.txt
|
||||
Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
|
||||
|
||||
Required properties:
|
||||
- bank-width: Width of NOR flash in bytes. GPMC supports 8-bit and
|
||||
@ -28,7 +28,7 @@ Required properties:
|
||||
|
||||
Optional properties:
|
||||
- gpmc,XXX Additional GPMC timings and settings parameters. See
|
||||
Documentation/devicetree/bindings/bus/ti-gpmc.txt
|
||||
Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
|
||||
|
||||
Optional properties for partition table parsing:
|
||||
- #address-cells: should be set to 1
|
||||
|
@ -5,7 +5,7 @@ the GPMC controller with a name of "onenand".
|
||||
|
||||
All timing relevant properties as well as generic gpmc child properties are
|
||||
explained in separate documents - please refer to
|
||||
Documentation/devicetree/bindings/bus/ti-gpmc.txt
|
||||
Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
|
||||
|
||||
Required properties:
|
||||
|
||||
|
@ -4,7 +4,12 @@ The GPMI nand controller provides an interface to control the
|
||||
NAND flash chips.
|
||||
|
||||
Required properties:
|
||||
- compatible : should be "fsl,<chip>-gpmi-nand"
|
||||
- compatible : should be "fsl,<chip>-gpmi-nand", chip can be:
|
||||
* imx23
|
||||
* imx28
|
||||
* imx6q
|
||||
* imx6sx
|
||||
* imx7d
|
||||
- reg : should contain registers location and length for gpmi and bch.
|
||||
- reg-names: Should contain the reg names "gpmi-nand" and "bch"
|
||||
- interrupts : BCH interrupt number.
|
||||
@ -13,6 +18,13 @@ Required properties:
|
||||
and GPMI DMA channel ID.
|
||||
Refer to dma.txt and fsl-mxs-dma.txt for details.
|
||||
- dma-names: Must be "rx-tx".
|
||||
- clocks : clocks phandle and clock specifier corresponding to each clock
|
||||
specified in clock-names.
|
||||
- clock-names : The "gpmi_io" clock is always required. Which clocks are
|
||||
exactly required depends on chip:
|
||||
* imx23/imx28 : "gpmi_io"
|
||||
* imx6q/sx : "gpmi_io", "gpmi_apb", "gpmi_bch", "gpmi_bch_apb", "per1_bch"
|
||||
* imx7d : "gpmi_io", "gpmi_bch_apb"
|
||||
|
||||
Optional properties:
|
||||
- nand-on-flash-bbt: boolean to enable on flash bbt option if not
|
||||
|
@ -0,0 +1,18 @@
|
||||
* MTD SPI driver for Microchip 23K256 (and similar) serial SRAM
|
||||
|
||||
Required properties:
|
||||
- #address-cells, #size-cells : Must be present if the device has sub-nodes
|
||||
representing partitions.
|
||||
- compatible : Must be one of "microchip,mchp23k256" or "microchip,mchp23lcv1024"
|
||||
- reg : Chip-Select number
|
||||
- spi-max-frequency : Maximum frequency of the SPI bus the chip can operate at
|
||||
|
||||
Example:
|
||||
|
||||
spi-sram@0 {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <1>;
|
||||
compatible = "microchip,mchp23k256";
|
||||
reg = <0>;
|
||||
spi-max-frequency = <20000000>;
|
||||
};
|
@ -12,7 +12,8 @@ tree nodes.
|
||||
|
||||
The first part of NFC is NAND Controller Interface (NFI) HW.
|
||||
Required NFI properties:
|
||||
- compatible: Should be "mediatek,mtxxxx-nfc".
|
||||
- compatible: Should be one of "mediatek,mt2701-nfc",
|
||||
"mediatek,mt2712-nfc".
|
||||
- reg: Base physical address and size of NFI.
|
||||
- interrupts: Interrupts of NFI.
|
||||
- clocks: NFI required clocks.
|
||||
@ -141,7 +142,7 @@ Example:
|
||||
==============
|
||||
|
||||
Required BCH properties:
|
||||
- compatible: Should be "mediatek,mtxxxx-ecc".
|
||||
- compatible: Should be one of "mediatek,mt2701-ecc", "mediatek,mt2712-ecc".
|
||||
- reg: Base physical address and size of ECC.
|
||||
- interrupts: Interrupts of ECC.
|
||||
- clocks: ECC required clocks.
|
||||
|
@ -21,7 +21,7 @@ Optional NAND chip properties:
|
||||
|
||||
- nand-ecc-mode : String, operation mode of the NAND ecc mode.
|
||||
Supported values are: "none", "soft", "hw", "hw_syndrome",
|
||||
"hw_oob_first".
|
||||
"hw_oob_first", "on-die".
|
||||
Deprecated values:
|
||||
"soft_bch": use "soft" and nand-ecc-algo instead
|
||||
- nand-ecc-algo: string, algorithm of NAND ECC.
|
||||
|
@ -1,29 +1,49 @@
|
||||
Representing flash partitions in devicetree
|
||||
Flash partitions in device tree
|
||||
===============================
|
||||
|
||||
Partitions can be represented by sub-nodes of an mtd device. This can be used
|
||||
Flash devices can be partitioned into one or more functional ranges (e.g. "boot
|
||||
code", "nvram", "kernel").
|
||||
|
||||
Different devices may be partitioned in different ways. Some may use a fixed
|
||||
flash layout set at production time. Some may use an on-flash table that describes
|
||||
the geometry and naming/purpose of each functional region. It is also possible
|
||||
to see these methods mixed.
|
||||
|
||||
To assist system software in locating partitions, we allow describing which
|
||||
method is used for a given flash device. To describe the method there should be
|
||||
a subnode of the flash device that is named 'partitions'. It must have a
|
||||
'compatible' property, which is used to identify the method to use.
|
||||
|
||||
We currently only document a binding for fixed layouts.
|
||||
|
||||
|
||||
Fixed Partitions
|
||||
================
|
||||
|
||||
Partitions can be represented by sub-nodes of a flash device. This can be used
|
||||
on platforms which have strong conventions about which portions of a flash are
|
||||
used for what purposes, but which don't use an on-flash partition table such
|
||||
as RedBoot.
|
||||
|
||||
The partition table should be a subnode of the mtd node and should be named
|
||||
The partition table should be a subnode of the flash node and should be named
|
||||
'partitions'. This node should have the following property:
|
||||
- compatible : (required) must be "fixed-partitions"
|
||||
Partitions are then defined in subnodes of the partitions node.
|
||||
|
||||
For backwards compatibility partitions as direct subnodes of the mtd device are
|
||||
For backwards compatibility partitions as direct subnodes of the flash device are
|
||||
supported. This use is discouraged.
|
||||
NOTE: also for backwards compatibility, direct subnodes that have a compatible
|
||||
string are not considered partitions, as they may be used for other bindings.
|
||||
|
||||
#address-cells & #size-cells must both be present in the partitions subnode of the
|
||||
mtd device. There are two valid values for both:
|
||||
flash device. There are two valid values for both:
|
||||
<1>: for partitions that require a single 32-bit cell to represent their
|
||||
size/address (aka the value is below 4 GiB)
|
||||
<2>: for partitions that require two 32-bit cells to represent their
|
||||
size/address (aka the value is 4 GiB or greater).
|
||||
|
||||
Required properties:
|
||||
- reg : The partition's offset and size within the mtd bank.
|
||||
- reg : The partition's offset and size within the flash
|
||||
|
||||
Optional properties:
|
||||
- label : The label / name for this partition. If omitted, the label is taken
|
||||
|
@ -9,7 +9,7 @@ the GPMC controller with an "ethernet" name.
|
||||
|
||||
All timing relevant properties as well as generic GPMC child properties are
|
||||
explained in separate documents. Please refer to
|
||||
Documentation/devicetree/bindings/bus/ti-gpmc.txt
|
||||
Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
|
||||
|
||||
For the properties relevant to the ethernet controller connected to the GPMC
|
||||
refer to the binding documentation of the device. For example, the documentation
|
||||
@ -43,7 +43,7 @@ Required properties:
|
||||
|
||||
Optional properties:
|
||||
- gpmc,XXX Additional GPMC timings and settings parameters. See
|
||||
Documentation/devicetree/bindings/bus/ti-gpmc.txt
|
||||
Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
|
||||
|
||||
Example:
|
||||
|
||||
|
@ -2,7 +2,9 @@ Amlogic Meson PWM Controller
|
||||
============================
|
||||
|
||||
Required properties:
|
||||
- compatible: Shall contain "amlogic,meson8b-pwm" or "amlogic,meson-gxbb-pwm".
|
||||
- compatible: Shall contain "amlogic,meson8b-pwm"
|
||||
or "amlogic,meson-gxbb-pwm"
|
||||
or "amlogic,meson-gxbb-ao-pwm"
|
||||
- #pwm-cells: Should be 3. See pwm.txt in this directory for a description of
|
||||
the cells format.
|
||||
|
||||
|
@ -24,7 +24,7 @@ Example:
|
||||
compatible = "st,stm32-timers";
|
||||
reg = <0x40010000 0x400>;
|
||||
clocks = <&rcc 0 160>;
|
||||
clock-names = "clk_int";
|
||||
clock-names = "int";
|
||||
|
||||
pwm {
|
||||
compatible = "st,stm32-pwm";
|
||||
|
@ -8,6 +8,7 @@ Required Properties:
|
||||
- "renesas,pwm-r8a7791": for R-Car M2-W
|
||||
- "renesas,pwm-r8a7794": for R-Car E2
|
||||
- "renesas,pwm-r8a7795": for R-Car H3
|
||||
- "renesas,pwm-r8a7796": for R-Car M3-W
|
||||
- reg: base address and length of the registers block for the PWM.
|
||||
- #pwm-cells: should be 2. See pwm.txt in this directory for a description of
|
||||
the cells format.
|
||||
|
@ -0,0 +1,22 @@
|
||||
Broadcom STB wake-up Timer
|
||||
|
||||
The Broadcom STB wake-up timer provides a 27 MHz resolution timer, with the
|
||||
ability to wake up the system from low-power suspend/standby modes.
|
||||
|
||||
Required properties:
|
||||
- compatible : should contain "brcm,brcmstb-waketimer"
|
||||
- reg : the register start and length for the WKTMR block
|
||||
- interrupts : The TIMER interrupt
|
||||
- interrupt-parent: The phandle to the Always-On (AON) Power Management (PM) L2
|
||||
interrupt controller node
|
||||
- clocks : The phandle to the UPG fixed clock (27 MHz domain)
|
||||
|
||||
Example:
|
||||
|
||||
waketimer@f0411580 {
|
||||
compatible = "brcm,brcmstb-waketimer";
|
||||
reg = <0xf0411580 0x14>;
|
||||
interrupts = <0x3>;
|
||||
interrupt-parent = <&aon_pm_l2_intc>;
|
||||
clocks = <&upg_fixed>;
|
||||
};
|
@ -1,14 +0,0 @@
|
||||
* Cortina Systems Gemini RTC
|
||||
|
||||
Gemini SoC real-time clock.
|
||||
|
||||
Required properties:
|
||||
- compatible : Should be "cortina,gemini-rtc"
|
||||
|
||||
Examples:
|
||||
|
||||
rtc@45000000 {
|
||||
compatible = "cortina,gemini-rtc";
|
||||
reg = <0x45000000 0x100>;
|
||||
interrupts = <17 IRQ_TYPE_LEVEL_HIGH>;
|
||||
};
|
28
Documentation/devicetree/bindings/rtc/faraday,ftrtc010.txt
Normal file
28
Documentation/devicetree/bindings/rtc/faraday,ftrtc010.txt
Normal file
@ -0,0 +1,28 @@
|
||||
* Faraday Technology FTRTC010 Real Time Clock
|
||||
|
||||
This RTC appears in, for example, the Storlink Gemini family of
|
||||
SoCs.
|
||||
|
||||
Required properties:
|
||||
- compatible : Should be one of:
|
||||
"faraday,ftrtc010"
|
||||
"cortina,gemini-rtc", "faraday,ftrtc010"
|
||||
|
||||
Optional properties:
|
||||
- clocks: when present should contain clock references to the
|
||||
PCLK and EXTCLK clocks. Faraday calls the latter CLK1HZ and
|
||||
says the clock should be 1 Hz, but implementers actually seem
|
||||
to choose different clocks here, like Cortina who chose
|
||||
32768 Hz (a typical low-power clock).
|
||||
- clock-names: should name the clocks "PCLK" and "EXTCLK"
|
||||
respectively.
|
||||
|
||||
Examples:
|
||||
|
||||
rtc@45000000 {
|
||||
compatible = "cortina,gemini-rtc";
|
||||
reg = <0x45000000 0x100>;
|
||||
interrupts = <17 IRQ_TYPE_LEVEL_HIGH>;
|
||||
clocks = <&foo 0>, <&foo 1>;
|
||||
clock-names = "PCLK", "EXTCLK";
|
||||
};
|
@ -1,17 +1,25 @@
|
||||
STM32 Real Time Clock
|
||||
|
||||
Required properties:
|
||||
- compatible: "st,stm32-rtc".
|
||||
- compatible: can be either "st,stm32-rtc" or "st,stm32h7-rtc", depending on
|
||||
whether the device is compatible with stm32(f4/f7) or stm32h7.
|
||||
- reg: address range of rtc register set.
|
||||
- clocks: reference to the clock entry ck_rtc.
|
||||
- clocks: can use up to two clocks, depending on part used:
|
||||
- "rtc_ck": RTC clock source.
|
||||
It is required on stm32(f4/f7) and stm32h7.
|
||||
- "pclk": RTC APB interface clock.
|
||||
It is not present on stm32(f4/f7).
|
||||
It is required on stm32h7.
|
||||
- clock-names: must be "rtc_ck" and "pclk".
|
||||
It is required only on stm32h7.
|
||||
- interrupt-parent: phandle for the interrupt controller.
|
||||
- interrupts: rtc alarm interrupt.
|
||||
- st,syscfg: phandle for pwrcfg, mandatory to disable/enable backup domain
|
||||
(RTC registers) write protection.
|
||||
|
||||
Optional properties (to override default ck_rtc parent clock):
|
||||
- assigned-clocks: reference to the ck_rtc clock entry.
|
||||
- assigned-clock-parents: phandle of the new parent clock of ck_rtc.
|
||||
Optional properties (to override default rtc_ck parent clock):
|
||||
- assigned-clocks: reference to the rtc_ck clock entry.
|
||||
- assigned-clock-parents: phandle of the new parent clock of rtc_ck.
|
||||
|
||||
Example:
|
||||
|
||||
@ -25,3 +33,17 @@ Example:
|
||||
interrupts = <17 1>;
|
||||
st,syscfg = <&pwrcfg>;
|
||||
};
|
||||
|
||||
rtc: rtc@58004000 {
|
||||
compatible = "st,stm32h7-rtc";
|
||||
reg = <0x58004000 0x400>;
|
||||
clocks = <&rcc RTCAPB_CK>, <&rcc RTC_CK>;
|
||||
clock-names = "pclk", "rtc_ck";
|
||||
assigned-clocks = <&rcc RTC_CK>;
|
||||
assigned-clock-parents = <&rcc LSE_CK>;
|
||||
interrupt-parent = <&exti>;
|
||||
interrupts = <17 1>;
|
||||
interrupt-names = "alarm";
|
||||
st,syscfg = <&pwrcfg>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
23
Documentation/devicetree/bindings/watchdog/da9062-wdt.txt
Normal file
23
Documentation/devicetree/bindings/watchdog/da9062-wdt.txt
Normal file
@ -0,0 +1,23 @@
|
||||
* Dialog Semiconductor DA9062/61 Watchdog Timer
|
||||
|
||||
Required properties:
|
||||
|
||||
- compatible: should be one of the following valid compatible string lines:
|
||||
"dlg,da9061-watchdog", "dlg,da9062-watchdog"
|
||||
"dlg,da9062-watchdog"
|
||||
|
||||
Example: DA9062
|
||||
|
||||
pmic0: da9062@58 {
|
||||
watchdog {
|
||||
compatible = "dlg,da9062-watchdog";
|
||||
};
|
||||
};
|
||||
|
||||
Example: DA9061 using a fall-back compatible for the DA9062 watchdog driver
|
||||
|
||||
pmic0: da9061@58 {
|
||||
watchdog {
|
||||
compatible = "dlg,da9061-watchdog", "dlg,da9062-watchdog";
|
||||
};
|
||||
};
|
@ -10,6 +10,8 @@ Required Properties:
|
||||
Optional Properties:
|
||||
|
||||
- interrupts : The interrupt used for the watchdog timeout warning.
|
||||
- resets : phandle pointing to the system reset controller with
|
||||
line index for the watchdog.
|
||||
|
||||
Example:
|
||||
|
||||
@ -18,4 +20,5 @@ Example:
|
||||
reg = <0xffd02000 0x1000>;
|
||||
interrupts = <0 171 4>;
|
||||
clocks = <&per_base_clk>;
|
||||
resets = <&rst WDT0_RESET>;
|
||||
};
|
||||
|
@ -2,10 +2,11 @@ Renesas Watchdog Timer (WDT) Controller
|
||||
|
||||
Required properties:
|
||||
- compatible : Should be "renesas,<soctype>-wdt", and
|
||||
"renesas,rcar-gen3-wdt" as fallback.
|
||||
"renesas,rcar-gen3-wdt" or "renesas,rza-wdt" as fallback.
|
||||
Examples with soctypes are:
|
||||
- "renesas,r8a7795-wdt" (R-Car H3)
|
||||
- "renesas,r8a7796-wdt" (R-Car M3-W)
|
||||
- "renesas,r7s72100-wdt" (RZ/A1)
|
||||
|
||||
When compatible with the generic version, nodes must list the SoC-specific
|
||||
version corresponding to the platform first, followed by the generic
|
||||
@ -17,6 +18,7 @@ Required properties:
|
||||
Optional properties:
|
||||
- timeout-sec : Contains the watchdog timeout in seconds
|
||||
- power-domains : the power domain the WDT belongs to
|
||||
- interrupts: Some WDTs have an interrupt when used in interval timer mode
|
||||
|
||||
Examples:
|
||||
|
||||
|
19
Documentation/devicetree/bindings/watchdog/st,stm32-iwdg.txt
Normal file
19
Documentation/devicetree/bindings/watchdog/st,stm32-iwdg.txt
Normal file
@ -0,0 +1,19 @@
|
||||
STM32 Independent WatchDoG (IWDG)
|
||||
---------------------------------
|
||||
|
||||
Required properties:
|
||||
- compatible: "st,stm32-iwdg"
|
||||
- reg: physical base address and length of the registers set for the device
|
||||
- clocks: must contain a single entry describing the clock input
|
||||
|
||||
Optional Properties:
|
||||
- timeout-sec: Watchdog timeout value in seconds.
|
||||
|
||||
Example:
|
||||
|
||||
iwdg: watchdog@40003000 {
|
||||
compatible = "st,stm32-iwdg";
|
||||
reg = <0x40003000 0x400>;
|
||||
clocks = <&clk_lsi>;
|
||||
timeout-sec = <32>;
|
||||
};
|
20
Documentation/devicetree/bindings/watchdog/uniphier-wdt.txt
Normal file
20
Documentation/devicetree/bindings/watchdog/uniphier-wdt.txt
Normal file
@ -0,0 +1,20 @@
|
||||
UniPhier watchdog timer controller
|
||||
|
||||
This UniPhier watchdog timer controller must be under sysctrl node.
|
||||
|
||||
Required properties:
|
||||
- compatible: should be "socionext,uniphier-wdt"
|
||||
|
||||
Example:
|
||||
|
||||
sysctrl@61840000 {
|
||||
compatible = "socionext,uniphier-ld11-sysctrl",
|
||||
"simple-mfd", "syscon";
|
||||
reg = <0x61840000 0x4000>;
|
||||
|
||||
watchdog {
|
||||
compatible = "socionext,uniphier-wdt";
|
||||
};
|
||||
|
||||
other nodes ...
|
||||
};
|
@ -1,13 +1,20 @@
|
||||
==================================
|
||||
Digital Signature Verification API
|
||||
==================================
|
||||
|
||||
CONTENTS
|
||||
|
||||
1. Introduction
|
||||
2. API
|
||||
3. User-space utilities
|
||||
:Author: Dmitry Kasatkin
|
||||
:Date: 06.10.2011
|
||||
|
||||
|
||||
1. Introduction
|
||||
.. CONTENTS
|
||||
|
||||
1. Introduction
|
||||
2. API
|
||||
3. User-space utilities
|
||||
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
Digital signature verification API provides a method to verify digital signature.
|
||||
Currently digital signatures are used by the IMA/EVM integrity protection subsystem.
|
||||
@ -17,25 +24,25 @@ GnuPG multi-precision integers (MPI) library. The kernel port provides
|
||||
memory allocation errors handling, has been refactored according to kernel
|
||||
coding style, and checkpatch.pl reported errors and warnings have been fixed.
|
||||
|
||||
Public key and signature consist of header and MPIs.
|
||||
Public key and signature consist of header and MPIs::
|
||||
|
||||
struct pubkey_hdr {
|
||||
uint8_t version; /* key format version */
|
||||
time_t timestamp; /* key made, always 0 for now */
|
||||
uint8_t algo;
|
||||
uint8_t nmpi;
|
||||
char mpi[0];
|
||||
} __packed;
|
||||
struct pubkey_hdr {
|
||||
uint8_t version; /* key format version */
|
||||
time_t timestamp; /* key made, always 0 for now */
|
||||
uint8_t algo;
|
||||
uint8_t nmpi;
|
||||
char mpi[0];
|
||||
} __packed;
|
||||
|
||||
struct signature_hdr {
|
||||
uint8_t version; /* signature format version */
|
||||
time_t timestamp; /* signature made */
|
||||
uint8_t algo;
|
||||
uint8_t hash;
|
||||
uint8_t keyid[8];
|
||||
uint8_t nmpi;
|
||||
char mpi[0];
|
||||
} __packed;
|
||||
struct signature_hdr {
|
||||
uint8_t version; /* signature format version */
|
||||
time_t timestamp; /* signature made */
|
||||
uint8_t algo;
|
||||
uint8_t hash;
|
||||
uint8_t keyid[8];
|
||||
uint8_t nmpi;
|
||||
char mpi[0];
|
||||
} __packed;
|
||||
|
||||
keyid is equal to SHA1[12-19] over the total key content.
|
||||
Signature header is used as an input to generate a signature.
|
||||
@ -43,31 +50,33 @@ Such approach insures that key or signature header could not be changed.
|
||||
It protects the timestamp from being changed and can be used for rollback
|
||||
protection.
|
||||
|
||||
2. API
|
||||
API
|
||||
===
|
||||
|
||||
API currently includes only 1 function:
|
||||
API currently includes only 1 function::
|
||||
|
||||
digsig_verify() - digital signature verification with public key
|
||||
|
||||
|
||||
/**
|
||||
* digsig_verify() - digital signature verification with public key
|
||||
* @keyring: keyring to search key in
|
||||
* @sig: digital signature
|
||||
* @sigen: length of the signature
|
||||
* @data: data
|
||||
* @datalen: length of the data
|
||||
* @return: 0 on success, -EINVAL otherwise
|
||||
*
|
||||
* Verifies data integrity against digital signature.
|
||||
* Currently only RSA is supported.
|
||||
* Normally hash of the content is used as a data for this function.
|
||||
*
|
||||
*/
|
||||
int digsig_verify(struct key *keyring, const char *sig, int siglen,
|
||||
const char *data, int datalen);
|
||||
/**
|
||||
* digsig_verify() - digital signature verification with public key
|
||||
* @keyring: keyring to search key in
|
||||
* @sig: digital signature
|
||||
* @siglen: length of the signature
|
||||
* @data: data
|
||||
* @datalen: length of the data
|
||||
* @return: 0 on success, -EINVAL otherwise
|
||||
*
|
||||
* Verifies data integrity against digital signature.
|
||||
* Currently only RSA is supported.
|
||||
* Normally hash of the content is used as a data for this function.
|
||||
*
|
||||
*/
|
||||
int digsig_verify(struct key *keyring, const char *sig, int siglen,
|
||||
const char *data, int datalen);
|
||||
|
||||
3. User-space utilities
|
||||
User-space utilities
|
||||
====================
|
||||
|
||||
The signing and key management utilities evm-utils provide functionality
|
||||
to generate signatures, to load keys into the kernel keyring.
|
||||
@ -75,22 +84,18 @@ Keys can be in PEM or converted to the kernel format.
|
||||
When the key is added to the kernel keyring, the keyid defines the name
|
||||
of the key: 5D2B05FC633EE3E8 in the example below.
|
||||
|
||||
Here is example output of the keyctl utility.
|
||||
Here is example output of the keyctl utility::
|
||||
|
||||
$ keyctl show
|
||||
Session Keyring
|
||||
-3 --alswrv 0 0 keyring: _ses
|
||||
603976250 --alswrv 0 -1 \_ keyring: _uid.0
|
||||
817777377 --alswrv 0 0 \_ user: kmk
|
||||
891974900 --alswrv 0 0 \_ encrypted: evm-key
|
||||
170323636 --alswrv 0 0 \_ keyring: _module
|
||||
548221616 --alswrv 0 0 \_ keyring: _ima
|
||||
128198054 --alswrv 0 0 \_ keyring: _evm
|
||||
$ keyctl show
|
||||
Session Keyring
|
||||
-3 --alswrv 0 0 keyring: _ses
|
||||
603976250 --alswrv 0 -1 \_ keyring: _uid.0
|
||||
817777377 --alswrv 0 0 \_ user: kmk
|
||||
891974900 --alswrv 0 0 \_ encrypted: evm-key
|
||||
170323636 --alswrv 0 0 \_ keyring: _module
|
||||
548221616 --alswrv 0 0 \_ keyring: _ima
|
||||
128198054 --alswrv 0 0 \_ keyring: _evm
|
||||
|
||||
$ keyctl list 128198054
|
||||
1 key in keyring:
|
||||
620789745: --alswrv 0 0 user: 5D2B05FC633EE3E8
|
||||
|
||||
|
||||
Dmitry Kasatkin
|
||||
06.10.2011
|
||||
$ keyctl list 128198054
|
||||
1 key in keyring:
|
||||
620789745: --alswrv 0 0 user: 5D2B05FC633EE3E8
|
||||
|
@ -106,9 +106,6 @@ Kernel utility functions
|
||||
.. kernel-doc:: kernel/sys.c
|
||||
:export:
|
||||
|
||||
.. kernel-doc:: kernel/rcu/srcu.c
|
||||
:export:
|
||||
|
||||
.. kernel-doc:: kernel/rcu/tree.c
|
||||
:export:
|
||||
|
||||
|
@ -41,5 +41,8 @@ i2c_adapter devices which don't support those I2C operations.
|
||||
.. kernel-doc:: drivers/i2c/i2c-boardinfo.c
|
||||
:functions: i2c_register_board_info
|
||||
|
||||
.. kernel-doc:: drivers/i2c/i2c-core.c
|
||||
.. kernel-doc:: drivers/i2c/i2c-core-base.c
|
||||
:export:
|
||||
|
||||
.. kernel-doc:: drivers/i2c/i2c-core-smbus.c
|
||||
:export:
|
||||
|
@ -1,5 +1,6 @@
|
||||
The EFI Boot Stub
|
||||
---------------------------
|
||||
=================
|
||||
The EFI Boot Stub
|
||||
=================
|
||||
|
||||
On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade
|
||||
as a PE/COFF image, thereby convincing EFI firmware loaders to load
|
||||
@ -25,7 +26,8 @@ a certain sense it *IS* the boot loader.
|
||||
The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option.
|
||||
|
||||
|
||||
**** How to install bzImage.efi
|
||||
How to install bzImage.efi
|
||||
--------------------------
|
||||
|
||||
The bzImage located in arch/x86/boot/bzImage must be copied to the EFI
|
||||
System Partition (ESP) and renamed with the extension ".efi". Without
|
||||
@ -37,14 +39,16 @@ may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image
|
||||
should be copied but not necessarily renamed.
|
||||
|
||||
|
||||
**** Passing kernel parameters from the EFI shell
|
||||
Passing kernel parameters from the EFI shell
|
||||
--------------------------------------------
|
||||
|
||||
Arguments to the kernel can be passed after bzImage.efi, e.g.
|
||||
Arguments to the kernel can be passed after bzImage.efi, e.g.::
|
||||
|
||||
fs0:> bzImage.efi console=ttyS0 root=/dev/sda4
|
||||
|
||||
|
||||
**** The "initrd=" option
|
||||
The "initrd=" option
|
||||
--------------------
|
||||
|
||||
Like most boot loaders, the EFI stub allows the user to specify
|
||||
multiple initrd files using the "initrd=" option. This is the only EFI
|
||||
@ -54,9 +58,9 @@ kernel when it boots.
|
||||
The path to the initrd file must be an absolute path from the
|
||||
beginning of the ESP, relative path names do not work. Also, the path
|
||||
is an EFI-style path and directory elements must be separated with
|
||||
backslashes (\). For example, given the following directory layout,
|
||||
backslashes (\). For example, given the following directory layout::
|
||||
|
||||
fs0:>
|
||||
fs0:>
|
||||
Kernels\
|
||||
bzImage.efi
|
||||
initrd-large.img
|
||||
@ -66,7 +70,7 @@ fs0:>
|
||||
initrd-medium.img
|
||||
|
||||
to boot with the initrd-large.img file if the current working
|
||||
directory is fs0:\Kernels, the following command must be used,
|
||||
directory is fs0:\Kernels, the following command must be used::
|
||||
|
||||
fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img
|
||||
|
||||
@ -76,7 +80,8 @@ which understands relative paths, whereas the rest of the command line
|
||||
is passed to bzImage.efi.
|
||||
|
||||
|
||||
**** The "dtb=" option
|
||||
The "dtb=" option
|
||||
-----------------
|
||||
|
||||
For the ARM and arm64 architectures, we also need to be able to provide a
|
||||
device tree to the kernel. This is done with the "dtb=" command line option,
|
||||
|
@ -1,4 +1,8 @@
|
||||
EISA bus support (Marc Zyngier <maz@wild-wind.fr.eu.org>)
|
||||
================
|
||||
EISA bus support
|
||||
================
|
||||
|
||||
:Author: Marc Zyngier <maz@wild-wind.fr.eu.org>
|
||||
|
||||
This document groups random notes about porting EISA drivers to the
|
||||
new EISA/sysfs API.
|
||||
@ -14,168 +18,189 @@ detection code is generally also used to probe ISA cards). Moreover,
|
||||
most EISA drivers are among the oldest Linux drivers so, as you can
|
||||
imagine, some dust has settled here over the years.
|
||||
|
||||
The EISA infrastructure is made up of three parts :
|
||||
The EISA infrastructure is made up of three parts:
|
||||
|
||||
- The bus code implements most of the generic code. It is shared
|
||||
among all the architectures that the EISA code runs on. It
|
||||
implements bus probing (detecting EISA cards available on the bus),
|
||||
allocates I/O resources, allows fancy naming through sysfs, and
|
||||
offers interfaces for driver to register.
|
||||
among all the architectures that the EISA code runs on. It
|
||||
implements bus probing (detecting EISA cards available on the bus),
|
||||
allocates I/O resources, allows fancy naming through sysfs, and
|
||||
offers interfaces for driver to register.
|
||||
|
||||
- The bus root driver implements the glue between the bus hardware
|
||||
and the generic bus code. It is responsible for discovering the
|
||||
device implementing the bus, and setting it up to be latter probed
|
||||
by the bus code. This can go from something as simple as reserving
|
||||
an I/O region on x86, to the rather more complex, like the hppa
|
||||
EISA code. This is the part to implement in order to have EISA
|
||||
running on an "new" platform.
|
||||
and the generic bus code. It is responsible for discovering the
|
||||
device implementing the bus, and setting it up to be later probed
|
||||
by the bus code. This can go from something as simple as reserving
|
||||
an I/O region on x86, to the rather more complex, like the hppa
|
||||
EISA code. This is the part to implement in order to have EISA
|
||||
running on a "new" platform.
|
||||
|
||||
- The driver offers the bus a list of devices that it manages, and
|
||||
implements the necessary callbacks to probe and release devices
|
||||
whenever told to.
|
||||
implements the necessary callbacks to probe and release devices
|
||||
whenever told to.
|
||||
|
||||
Every function/structure below lives in <linux/eisa.h>, which depends
|
||||
heavily on <linux/device.h>.
|
||||
|
||||
** Bus root driver :
|
||||
Bus root driver
|
||||
===============
|
||||
|
||||
int eisa_root_register (struct eisa_root_device *root);
|
||||
::
|
||||
|
||||
int eisa_root_register (struct eisa_root_device *root);
|
||||
|
||||
The eisa_root_register function is used to declare a device as the
|
||||
root of an EISA bus. The eisa_root_device structure holds a reference
|
||||
to this device, as well as some parameters for probing purposes.
|
||||
to this device, as well as some parameters for probing purposes::
|
||||
|
||||
struct eisa_root_device {
|
||||
struct device *dev; /* Pointer to bridge device */
|
||||
struct resource *res;
|
||||
unsigned long bus_base_addr;
|
||||
int slots; /* Max slot number */
|
||||
int force_probe; /* Probe even when no slot 0 */
|
||||
u64 dma_mask; /* from bridge device */
|
||||
int bus_nr; /* Set by eisa_root_register */
|
||||
struct resource eisa_root_res; /* ditto */
|
||||
};
|
||||
struct eisa_root_device {
|
||||
struct device *dev; /* Pointer to bridge device */
|
||||
struct resource *res;
|
||||
unsigned long bus_base_addr;
|
||||
int slots; /* Max slot number */
|
||||
int force_probe; /* Probe even when no slot 0 */
|
||||
u64 dma_mask; /* from bridge device */
|
||||
int bus_nr; /* Set by eisa_root_register */
|
||||
struct resource eisa_root_res; /* ditto */
|
||||
};
|
||||
|
||||
node : used for eisa_root_register internal purpose
|
||||
dev : pointer to the root device
|
||||
res : root device I/O resource
|
||||
bus_base_addr : slot 0 address on this bus
|
||||
slots : max slot number to probe
|
||||
force_probe : Probe even when slot 0 is empty (no EISA mainboard)
|
||||
dma_mask : Default DMA mask. Usually the bridge device dma_mask.
|
||||
bus_nr : unique bus id, set by eisa_root_register
|
||||
============= ======================================================
|
||||
node used for eisa_root_register internal purpose
|
||||
dev pointer to the root device
|
||||
res root device I/O resource
|
||||
bus_base_addr slot 0 address on this bus
|
||||
slots max slot number to probe
|
||||
force_probe Probe even when slot 0 is empty (no EISA mainboard)
|
||||
dma_mask Default DMA mask. Usually the bridge device dma_mask.
|
||||
bus_nr unique bus id, set by eisa_root_register
|
||||
============= ======================================================
|
||||
|
||||
** Driver :
|
||||
Driver
|
||||
======
|
||||
|
||||
int eisa_driver_register (struct eisa_driver *edrv);
|
||||
void eisa_driver_unregister (struct eisa_driver *edrv);
|
||||
::
|
||||
|
||||
int eisa_driver_register (struct eisa_driver *edrv);
|
||||
void eisa_driver_unregister (struct eisa_driver *edrv);
|
||||
|
||||
Clear enough ?
|
||||
|
||||
struct eisa_device_id {
|
||||
char sig[EISA_SIG_LEN];
|
||||
unsigned long driver_data;
|
||||
};
|
||||
::
|
||||
|
||||
struct eisa_driver {
|
||||
const struct eisa_device_id *id_table;
|
||||
struct device_driver driver;
|
||||
};
|
||||
struct eisa_device_id {
|
||||
char sig[EISA_SIG_LEN];
|
||||
unsigned long driver_data;
|
||||
};
|
||||
|
||||
id_table : an array of NULL terminated EISA id strings,
|
||||
followed by an empty string. Each string can
|
||||
optionally be paired with a driver-dependent value
|
||||
(driver_data).
|
||||
struct eisa_driver {
|
||||
const struct eisa_device_id *id_table;
|
||||
struct device_driver driver;
|
||||
};
|
||||
|
||||
driver : a generic driver, such as described in
|
||||
Documentation/driver-model/driver.txt. Only .name,
|
||||
.probe and .remove members are mandatory.
|
||||
=============== ====================================================
|
||||
id_table an array of NULL terminated EISA id strings,
|
||||
followed by an empty string. Each string can
|
||||
optionally be paired with a driver-dependent value
|
||||
(driver_data).
|
||||
|
||||
An example is the 3c59x driver :
|
||||
driver a generic driver, such as described in
|
||||
Documentation/driver-model/driver.txt. Only .name,
|
||||
.probe and .remove members are mandatory.
|
||||
=============== ====================================================
|
||||
|
||||
static struct eisa_device_id vortex_eisa_ids[] = {
|
||||
{ "TCM5920", EISA_3C592_OFFSET },
|
||||
{ "TCM5970", EISA_3C597_OFFSET },
|
||||
{ "" }
|
||||
};
|
||||
An example is the 3c59x driver::
|
||||
|
||||
static struct eisa_driver vortex_eisa_driver = {
|
||||
.id_table = vortex_eisa_ids,
|
||||
.driver = {
|
||||
.name = "3c59x",
|
||||
.probe = vortex_eisa_probe,
|
||||
.remove = vortex_eisa_remove
|
||||
}
|
||||
};
|
||||
static struct eisa_device_id vortex_eisa_ids[] = {
|
||||
{ "TCM5920", EISA_3C592_OFFSET },
|
||||
{ "TCM5970", EISA_3C597_OFFSET },
|
||||
{ "" }
|
||||
};
|
||||
|
||||
** Device :
|
||||
static struct eisa_driver vortex_eisa_driver = {
|
||||
.id_table = vortex_eisa_ids,
|
||||
.driver = {
|
||||
.name = "3c59x",
|
||||
.probe = vortex_eisa_probe,
|
||||
.remove = vortex_eisa_remove
|
||||
}
|
||||
};
|
||||
|
||||
Device
|
||||
======
|
||||
|
||||
The sysfs framework calls .probe and .remove functions upon device
|
||||
discovery and removal (note that the .remove function is only called
|
||||
when driver is built as a module).
|
||||
|
||||
Both functions are passed a pointer to a 'struct device', which is
|
||||
encapsulated in a 'struct eisa_device' described as follows :
|
||||
encapsulated in a 'struct eisa_device' described as follows::
|
||||
|
||||
struct eisa_device {
|
||||
struct eisa_device_id id;
|
||||
int slot;
|
||||
int state;
|
||||
unsigned long base_addr;
|
||||
struct resource res[EISA_MAX_RESOURCES];
|
||||
u64 dma_mask;
|
||||
struct device dev; /* generic device */
|
||||
};
|
||||
struct eisa_device {
|
||||
struct eisa_device_id id;
|
||||
int slot;
|
||||
int state;
|
||||
unsigned long base_addr;
|
||||
struct resource res[EISA_MAX_RESOURCES];
|
||||
u64 dma_mask;
|
||||
struct device dev; /* generic device */
|
||||
};
|
||||
|
||||
id : EISA id, as read from device. id.driver_data is set from the
|
||||
matching driver EISA id.
|
||||
slot : slot number which the device was detected on
|
||||
state : set of flags indicating the state of the device. Current
|
||||
flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED.
|
||||
res : set of four 256 bytes I/O regions allocated to this device
|
||||
dma_mask: DMA mask set from the parent device.
|
||||
dev : generic device (see Documentation/driver-model/device.txt)
|
||||
======== ============================================================
|
||||
id EISA id, as read from device. id.driver_data is set from the
|
||||
matching driver EISA id.
|
||||
slot slot number which the device was detected on
|
||||
state set of flags indicating the state of the device. Current
|
||||
flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED.
|
||||
res set of four 256 bytes I/O regions allocated to this device
|
||||
dma_mask DMA mask set from the parent device.
|
||||
dev generic device (see Documentation/driver-model/device.txt)
|
||||
======== ============================================================
|
||||
|
||||
You can get the 'struct eisa_device' from 'struct device' using the
|
||||
'to_eisa_device' macro.
|
||||
|
||||
** Misc stuff :
|
||||
Misc stuff
|
||||
==========
|
||||
|
||||
void eisa_set_drvdata (struct eisa_device *edev, void *data);
|
||||
::
|
||||
|
||||
void eisa_set_drvdata (struct eisa_device *edev, void *data);
|
||||
|
||||
Stores data into the device's driver_data area.
|
||||
|
||||
void *eisa_get_drvdata (struct eisa_device *edev):
|
||||
::
|
||||
|
||||
void *eisa_get_drvdata (struct eisa_device *edev);
|
||||
|
||||
Gets the pointer previously stored into the device's driver_data area.
|
||||
|
||||
int eisa_get_region_index (void *addr);
|
||||
::
|
||||
|
||||
int eisa_get_region_index (void *addr);
|
||||
|
||||
Returns the region number (0 <= x < EISA_MAX_RESOURCES) of a given
|
||||
address.
|
||||
|
||||
** Kernel parameters :
|
||||
Kernel parameters
|
||||
=================
|
||||
|
||||
eisa_bus.enable_dev :
|
||||
eisa_bus.enable_dev
|
||||
A comma-separated list of slots to be enabled, even if the firmware
|
||||
set the card as disabled. The driver must be able to properly
|
||||
initialize the device in such conditions.
|
||||
|
||||
A comma-separated list of slots to be enabled, even if the firmware
|
||||
set the card as disabled. The driver must be able to properly
|
||||
initialize the device in such conditions.
|
||||
eisa_bus.disable_dev
|
||||
A comma-separated list of slots to be disabled, even if the firmware
|
||||
set the card as enabled. The driver won't be called to handle this
|
||||
device.
|
||||
|
||||
eisa_bus.disable_dev :
|
||||
virtual_root.force_probe
|
||||
Force the probing code to probe EISA slots even when it cannot find an
|
||||
EISA compliant mainboard (nothing appears on slot 0). Defaults to 0
|
||||
(don't force), and set to 1 (force probing) when either
|
||||
CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
|
||||
|
||||
A comma-separated list of slots to be enabled, even if the firmware
|
||||
set the card as enabled. The driver won't be called to handle this
|
||||
device.
|
||||
|
||||
virtual_root.force_probe :
|
||||
|
||||
Force the probing code to probe EISA slots even when it cannot find an
|
||||
EISA compliant mainboard (nothing appears on slot 0). Defaults to 0
|
||||
(don't force), and set to 1 (force probing) when either
|
||||
CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
|
||||
|
||||
** Random notes :
|
||||
Random notes
|
||||
============
|
||||
|
||||
Converting an EISA driver to the new API mostly involves *deleting*
|
||||
code (since probing is now in the core EISA code). Unfortunately, most
|
||||
@ -194,9 +219,11 @@ routine.
|
||||
For example, switching your favorite EISA SCSI card to the "hotplug"
|
||||
model is "the right thing"(tm).
|
||||
|
||||
** Thanks :
|
||||
Thanks
|
||||
======
|
||||
|
||||
I'd like to thank the following people for their help:
|
||||
|
||||
I'd like to thank the following people for their help :
|
||||
- Xavier Benigni for lending me a wonderful Alpha Jensen,
|
||||
- James Bottomley, Jeff Garzik for getting this stuff into the kernel,
|
||||
- Andries Brouwer for contributing numerous EISA ids,
|
||||
|
@ -134,6 +134,23 @@ use the boot option:
|
||||
fail_futex=
|
||||
mmc_core.fail_request=<interval>,<probability>,<space>,<times>
|
||||
|
||||
o proc entries
|
||||
|
||||
- /proc/<pid>/fail-nth:
|
||||
- /proc/self/task/<tid>/fail-nth:
|
||||
|
||||
Writing an integer N to this file makes the N-th call in the task fail.
|
||||
Read from this file returns an integer value. A value of '0' indicates
|
||||
that the fault setup with a previous write to this file was injected.
|
||||
A positive integer N indicates that the fault wasn't yet injected.
|
||||
Note that this file enables all types of faults (slab, futex, etc).
|
||||
This setting takes precedence over all other generic debugfs settings
|
||||
like probability, interval, times, etc. But per-capability settings
|
||||
(e.g. fail_futex/ignore-private) take precedence over it.
|
||||
|
||||
This feature is intended for systematic testing of faults in a single
|
||||
system call. See an example below.
|
||||
|
||||
How to add new fault injection capability
|
||||
-----------------------------------------
|
||||
|
||||
@ -278,3 +295,65 @@ allocation failure.
|
||||
# env FAILCMD_TYPE=fail_page_alloc \
|
||||
./tools/testing/fault-injection/failcmd.sh --times=100 \
|
||||
-- make -C tools/testing/selftests/ run_tests
|
||||
|
||||
Systematic faults using fail-nth
|
||||
---------------------------------
|
||||
|
||||
The following code systematically faults 0-th, 1-st, 2-nd and so on
|
||||
capabilities in the socketpair() system call.
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
int i, err, res, fail_nth, fds[2];
|
||||
char buf[128];
|
||||
|
||||
system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait");
|
||||
sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid));
|
||||
fail_nth = open(buf, O_RDWR);
|
||||
for (i = 1;; i++) {
|
||||
sprintf(buf, "%d", i);
|
||||
write(fail_nth, buf, strlen(buf));
|
||||
res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds);
|
||||
err = errno;
|
||||
pread(fail_nth, buf, sizeof(buf), 0);
|
||||
if (res == 0) {
|
||||
close(fds[0]);
|
||||
close(fds[1]);
|
||||
}
|
||||
printf("%d-th fault %c: res=%d/%d\n", i, atoi(buf) ? 'N' : 'Y',
|
||||
res, err);
|
||||
if (atoi(buf))
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
An example output:
|
||||
|
||||
1-th fault Y: res=-1/23
|
||||
2-th fault Y: res=-1/23
|
||||
3-th fault Y: res=-1/12
|
||||
4-th fault Y: res=-1/12
|
||||
5-th fault Y: res=-1/23
|
||||
6-th fault Y: res=-1/23
|
||||
7-th fault Y: res=-1/23
|
||||
8-th fault Y: res=-1/12
|
||||
9-th fault Y: res=-1/12
|
||||
10-th fault Y: res=-1/12
|
||||
11-th fault Y: res=-1/12
|
||||
12-th fault Y: res=-1/12
|
||||
13-th fault Y: res=-1/12
|
||||
14-th fault Y: res=-1/12
|
||||
15-th fault Y: res=-1/12
|
||||
16-th fault N: res=0/12
|
||||
|
@ -316,7 +316,7 @@ For version 5, the format of the message is:
|
||||
struct autofs_v5_packet {
|
||||
int proto_version; /* Protocol version */
|
||||
int type; /* Type of packet */
|
||||
autofs_wqt_t wait_queue_entry_token;
|
||||
autofs_wqt_t wait_queue_token;
|
||||
__u32 dev;
|
||||
__u64 ino;
|
||||
__u32 uid;
|
||||
@ -341,12 +341,12 @@ The pipe will be set to "packet mode" (equivalent to passing
|
||||
`O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at
|
||||
most one packet, and any unread portion of a packet will be discarded.
|
||||
|
||||
The `wait_queue_entry_token` is a unique number which can identify a
|
||||
The `wait_queue_token` is a unique number which can identify a
|
||||
particular request to be acknowledged. When a message is sent over
|
||||
the pipe the affected dentry is marked as either "active" or
|
||||
"expiring" and other accesses to it block until the message is
|
||||
acknowledged using one of the ioctls below and the relevant
|
||||
`wait_queue_entry_token`.
|
||||
`wait_queue_token`.
|
||||
|
||||
Communicating with autofs: root directory ioctls
|
||||
------------------------------------------------
|
||||
@ -358,7 +358,7 @@ capability, or must be the automount daemon.
|
||||
The available ioctl commands are:
|
||||
|
||||
- **AUTOFS_IOC_READY**: a notification has been handled. The argument
|
||||
to the ioctl command is the "wait_queue_entry_token" number
|
||||
to the ioctl command is the "wait_queue_token" number
|
||||
corresponding to the notification being acknowledged.
|
||||
- **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with
|
||||
the error code `ENOENT`.
|
||||
@ -382,14 +382,14 @@ The available ioctl commands are:
|
||||
struct autofs_packet_expire_multi {
|
||||
int proto_version; /* Protocol version */
|
||||
int type; /* Type of packet */
|
||||
autofs_wqt_t wait_queue_entry_token;
|
||||
autofs_wqt_t wait_queue_token;
|
||||
int len;
|
||||
char name[NAME_MAX+1];
|
||||
};
|
||||
|
||||
is required. This is filled in with the name of something
|
||||
that can be unmounted or removed. If nothing can be expired,
|
||||
`errno` is set to `EAGAIN`. Even though a `wait_queue_entry_token`
|
||||
`errno` is set to `EAGAIN`. Even though a `wait_queue_token`
|
||||
is present in the structure, no "wait queue" is established
|
||||
and no acknowledgment is needed.
|
||||
- **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to
|
||||
|
@ -155,11 +155,15 @@ noinline_data Disable the inline data feature, inline data feature is
|
||||
enabled by default.
|
||||
data_flush Enable data flushing before checkpoint in order to
|
||||
persist data of regular and symlink.
|
||||
fault_injection=%d Enable fault injection in all supported types with
|
||||
specified injection rate.
|
||||
mode=%s Control block allocation mode which supports "adaptive"
|
||||
and "lfs". In "lfs" mode, there should be no random
|
||||
writes towards main area.
|
||||
io_bits=%u Set the bit size of write IO requests. It should be set
|
||||
with "mode=lfs".
|
||||
usrquota Enable plain user disk quota accounting.
|
||||
grpquota Enable plain group disk quota accounting.
|
||||
|
||||
================================================================================
|
||||
DEBUGFS ENTRIES
|
||||
|
@ -201,6 +201,40 @@ rightmost one and going left. In the above example lower1 will be the
|
||||
top, lower2 the middle and lower3 the bottom layer.
|
||||
|
||||
|
||||
Sharing and copying layers
|
||||
--------------------------
|
||||
|
||||
Lower layers may be shared among several overlay mounts and that is indeed
|
||||
a very common practice. An overlay mount may use the same lower layer
|
||||
path as another overlay mount and it may use a lower layer path that is
|
||||
beneath or above the path of another overlay lower layer path.
|
||||
|
||||
Using an upper layer path and/or a workdir path that are already used by
|
||||
another overlay mount is not allowed and will fail with EBUSY. Using
|
||||
partially overlapping paths is not allowed but will not fail with EBUSY.
|
||||
|
||||
Mounting an overlay using an upper layer path, where the upper layer path
|
||||
was previously used by another mounted overlay in combination with a
|
||||
different lower layer path, is allowed, unless the "inodes index" feature
|
||||
is enabled.
|
||||
|
||||
With the "inodes index" feature, on the first time mount, an NFS file
|
||||
handle of the lower layer root directory, along with the UUID of the lower
|
||||
filesystem, are encoded and stored in the "trusted.overlay.origin" extended
|
||||
attribute on the upper layer root directory. On subsequent mount attempts,
|
||||
the lower root directory file handle and lower filesystem UUID are compared
|
||||
to the stored origin in upper root directory. On failure to verify the
|
||||
lower root origin, mount will fail with ESTALE. An overlayfs mount with
|
||||
"inodes index" enabled will fail with EOPNOTSUPP if the lower filesystem
|
||||
does not support NFS export, lower filesystem does not have a valid UUID or
|
||||
if the upper filesystem does not support extended attributes.
|
||||
|
||||
It is quite a common practice to copy overlay layers to a different
|
||||
directory tree on the same or different underlying filesystem, and even
|
||||
to a different machine. With the "inodes index" feature, trying to mount
|
||||
the copied layers will fail the verification of the lower root file handle.
|
||||
|
||||
|
||||
Non-standard behavior
|
||||
---------------------
|
||||
|
||||
|
@ -1786,12 +1786,16 @@ pair provide additional information particular to the objects they represent.
|
||||
pos: 0
|
||||
flags: 02
|
||||
mnt_id: 9
|
||||
tfd: 5 events: 1d data: ffffffffffffffff
|
||||
tfd: 5 events: 1d data: ffffffffffffffff pos:0 ino:61af sdev:7
|
||||
|
||||
where 'tfd' is a target file descriptor number in decimal form,
|
||||
'events' is the events mask being watched and 'data' is the data
|
||||
associated with a target [see epoll(7) for more details].
|
||||
|
||||
The 'pos' is current offset of the target file in decimal form
|
||||
[see lseek(2)], 'ino' and 'sdev' are inode and device numbers
|
||||
where the target file resides, both in hex format.
|
||||
|
||||
Fsnotify files
|
||||
~~~~~~~~~~~~~~
|
||||
For inotify files the format is the following
|
||||
|
@ -1225,12 +1225,6 @@ The underlying reason for the above rules is to make sure, that a
|
||||
mount can be accurately replicated (e.g. umounting and mounting again)
|
||||
based on the information found in /proc/mounts.
|
||||
|
||||
A simple method of saving options at mount/remount time and showing
|
||||
them is provided with the save_mount_options() and
|
||||
generic_show_options() helper functions. Please note, that using
|
||||
these may have drawbacks. For more info see header comments for these
|
||||
functions in fs/namespace.c.
|
||||
|
||||
Resources
|
||||
=========
|
||||
|
||||
|
@ -1,6 +1,9 @@
|
||||
===================================
|
||||
Using flexible arrays in the kernel
|
||||
Last updated for 2.6.32
|
||||
Jonathan Corbet <corbet@lwn.net>
|
||||
===================================
|
||||
|
||||
:Updated: Last updated for 2.6.32
|
||||
:Author: Jonathan Corbet <corbet@lwn.net>
|
||||
|
||||
Large contiguous memory allocations can be unreliable in the Linux kernel.
|
||||
Kernel programmers will sometimes respond to this problem by allocating
|
||||
@ -26,7 +29,7 @@ operation. It's also worth noting that flexible arrays do no internal
|
||||
locking at all; if concurrent access to an array is possible, then the
|
||||
caller must arrange for appropriate mutual exclusion.
|
||||
|
||||
The creation of a flexible array is done with:
|
||||
The creation of a flexible array is done with::
|
||||
|
||||
#include <linux/flex_array.h>
|
||||
|
||||
@ -40,14 +43,14 @@ argument is passed directly to the internal memory allocation calls. With
|
||||
the current code, using flags to ask for high memory is likely to lead to
|
||||
notably unpleasant side effects.
|
||||
|
||||
It is also possible to define flexible arrays at compile time with:
|
||||
It is also possible to define flexible arrays at compile time with::
|
||||
|
||||
DEFINE_FLEX_ARRAY(name, element_size, total);
|
||||
|
||||
This macro will result in a definition of an array with the given name; the
|
||||
element size and total will be checked for validity at compile time.
|
||||
|
||||
Storing data into a flexible array is accomplished with a call to:
|
||||
Storing data into a flexible array is accomplished with a call to::
|
||||
|
||||
int flex_array_put(struct flex_array *array, unsigned int element_nr,
|
||||
void *src, gfp_t flags);
|
||||
@ -63,7 +66,7 @@ running in some sort of atomic context; in this situation, sleeping in the
|
||||
memory allocator would be a bad thing. That can be avoided by using
|
||||
GFP_ATOMIC for the flags value, but, often, there is a better way. The
|
||||
trick is to ensure that any needed memory allocations are done before
|
||||
entering atomic context, using:
|
||||
entering atomic context, using::
|
||||
|
||||
int flex_array_prealloc(struct flex_array *array, unsigned int start,
|
||||
unsigned int nr_elements, gfp_t flags);
|
||||
@ -73,7 +76,7 @@ defined by start and nr_elements has been allocated. Thereafter, a
|
||||
flex_array_put() call on an element in that range is guaranteed not to
|
||||
block.
|
||||
|
||||
Getting data back out of the array is done with:
|
||||
Getting data back out of the array is done with::
|
||||
|
||||
void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
|
||||
|
||||
@ -89,7 +92,7 @@ involving that number probably result from use of unstored array entries.
|
||||
Note that, if array elements are allocated with __GFP_ZERO, they will be
|
||||
initialized to zero and this poisoning will not happen.
|
||||
|
||||
Individual elements in the array can be cleared with:
|
||||
Individual elements in the array can be cleared with::
|
||||
|
||||
int flex_array_clear(struct flex_array *array, unsigned int element_nr);
|
||||
|
||||
@ -97,7 +100,7 @@ This function will set the given element to FLEX_ARRAY_FREE and return
|
||||
zero. If storage for the indicated element is not allocated for the array,
|
||||
flex_array_clear() will return -EINVAL instead. Note that clearing an
|
||||
element does not release the storage associated with it; to reduce the
|
||||
allocated size of an array, call:
|
||||
allocated size of an array, call::
|
||||
|
||||
int flex_array_shrink(struct flex_array *array);
|
||||
|
||||
@ -106,12 +109,12 @@ This function works by scanning the array for pages containing nothing but
|
||||
FLEX_ARRAY_FREE bytes, so (1) it can be expensive, and (2) it will not work
|
||||
if the array's pages are allocated with __GFP_ZERO.
|
||||
|
||||
It is possible to remove all elements of an array with a call to:
|
||||
It is possible to remove all elements of an array with a call to::
|
||||
|
||||
void flex_array_free_parts(struct flex_array *array);
|
||||
|
||||
This call frees all elements, but leaves the array itself in place.
|
||||
Freeing the entire array is done with:
|
||||
Freeing the entire array is done with::
|
||||
|
||||
void flex_array_free(struct flex_array *array);
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
================
|
||||
Futex Requeue PI
|
||||
----------------
|
||||
================
|
||||
|
||||
Requeueing of tasks from a non-PI futex to a PI futex requires
|
||||
special handling in order to ensure the underlying rt_mutex is never
|
||||
@ -20,28 +21,28 @@ implementation would wake the highest-priority waiter, and leave the
|
||||
rest to the natural wakeup inherent in unlocking the mutex
|
||||
associated with the condvar.
|
||||
|
||||
Consider the simplified glibc calls:
|
||||
Consider the simplified glibc calls::
|
||||
|
||||
/* caller must lock mutex */
|
||||
pthread_cond_wait(cond, mutex)
|
||||
{
|
||||
lock(cond->__data.__lock);
|
||||
unlock(mutex);
|
||||
do {
|
||||
unlock(cond->__data.__lock);
|
||||
futex_wait(cond->__data.__futex);
|
||||
lock(cond->__data.__lock);
|
||||
} while(...)
|
||||
unlock(cond->__data.__lock);
|
||||
lock(mutex);
|
||||
}
|
||||
/* caller must lock mutex */
|
||||
pthread_cond_wait(cond, mutex)
|
||||
{
|
||||
lock(cond->__data.__lock);
|
||||
unlock(mutex);
|
||||
do {
|
||||
unlock(cond->__data.__lock);
|
||||
futex_wait(cond->__data.__futex);
|
||||
lock(cond->__data.__lock);
|
||||
} while(...)
|
||||
unlock(cond->__data.__lock);
|
||||
lock(mutex);
|
||||
}
|
||||
|
||||
pthread_cond_broadcast(cond)
|
||||
{
|
||||
lock(cond->__data.__lock);
|
||||
unlock(cond->__data.__lock);
|
||||
futex_requeue(cond->data.__futex, cond->mutex);
|
||||
}
|
||||
pthread_cond_broadcast(cond)
|
||||
{
|
||||
lock(cond->__data.__lock);
|
||||
unlock(cond->__data.__lock);
|
||||
futex_requeue(cond->__data.__futex, cond->mutex);
|
||||
}
|
||||
|
||||
Once pthread_cond_broadcast() requeues the tasks, the cond->mutex
|
||||
has waiters. Note that pthread_cond_wait() attempts to lock the
|
||||
@ -53,29 +54,29 @@ In order to support PI-aware pthread_condvar's, the kernel needs to
|
||||
be able to requeue tasks to PI futexes. This support implies that
|
||||
upon a successful futex_wait system call, the caller would return to
|
||||
user space already holding the PI futex. The glibc implementation
|
||||
would be modified as follows:
|
||||
would be modified as follows::
|
||||
|
||||
|
||||
/* caller must lock mutex */
|
||||
pthread_cond_wait_pi(cond, mutex)
|
||||
{
|
||||
lock(cond->__data.__lock);
|
||||
unlock(mutex);
|
||||
do {
|
||||
unlock(cond->__data.__lock);
|
||||
futex_wait_requeue_pi(cond->__data.__futex);
|
||||
lock(cond->__data.__lock);
|
||||
} while(...)
|
||||
unlock(cond->__data.__lock);
|
||||
/* the kernel acquired the mutex for us */
|
||||
}
|
||||
/* caller must lock mutex */
|
||||
pthread_cond_wait_pi(cond, mutex)
|
||||
{
|
||||
lock(cond->__data.__lock);
|
||||
unlock(mutex);
|
||||
do {
|
||||
unlock(cond->__data.__lock);
|
||||
futex_wait_requeue_pi(cond->__data.__futex);
|
||||
lock(cond->__data.__lock);
|
||||
} while(...)
|
||||
unlock(cond->__data.__lock);
|
||||
/* the kernel acquired the mutex for us */
|
||||
}
|
||||
|
||||
pthread_cond_broadcast_pi(cond)
|
||||
{
|
||||
lock(cond->__data.__lock);
|
||||
unlock(cond->__data.__lock);
|
||||
futex_requeue_pi(cond->data.__futex, cond->mutex);
|
||||
}
|
||||
pthread_cond_broadcast_pi(cond)
|
||||
{
|
||||
lock(cond->__data.__lock);
|
||||
unlock(cond->__data.__lock);
|
||||
futex_requeue_pi(cond->__data.__futex, cond->mutex);
|
||||
}
|
||||
|
||||
The actual glibc implementation will likely test for PI and make the
|
||||
necessary changes inside the existing calls rather than creating new
|
||||
|
@ -1,14 +1,15 @@
|
||||
=========================
|
||||
GCC plugin infrastructure
|
||||
=========================
|
||||
|
||||
|
||||
1. Introduction
|
||||
===============
|
||||
Introduction
|
||||
============
|
||||
|
||||
GCC plugins are loadable modules that provide extra features to the
|
||||
compiler [1]. They are useful for runtime instrumentation and static analysis.
|
||||
compiler [1]_. They are useful for runtime instrumentation and static analysis.
|
||||
We can analyse, change and add further code during compilation via
|
||||
callbacks [2], GIMPLE [3], IPA [4] and RTL passes [5].
|
||||
callbacks [2]_, GIMPLE [3]_, IPA [4]_ and RTL passes [5]_.
|
||||
|
||||
The GCC plugin infrastructure of the kernel supports all gcc versions from
|
||||
4.5 to 6.0, building out-of-tree modules, cross-compilation and building in a
|
||||
@ -21,56 +22,61 @@ and versions 4.8+ can only be compiled by a C++ compiler.
|
||||
Currently the GCC plugin infrastructure supports only the x86, arm, arm64 and
|
||||
powerpc architectures.
|
||||
|
||||
This infrastructure was ported from grsecurity [6] and PaX [7].
|
||||
This infrastructure was ported from grsecurity [6]_ and PaX [7]_.
|
||||
|
||||
--
|
||||
[1] https://gcc.gnu.org/onlinedocs/gccint/Plugins.html
|
||||
[2] https://gcc.gnu.org/onlinedocs/gccint/Plugin-API.html#Plugin-API
|
||||
[3] https://gcc.gnu.org/onlinedocs/gccint/GIMPLE.html
|
||||
[4] https://gcc.gnu.org/onlinedocs/gccint/IPA.html
|
||||
[5] https://gcc.gnu.org/onlinedocs/gccint/RTL.html
|
||||
[6] https://grsecurity.net/
|
||||
[7] https://pax.grsecurity.net/
|
||||
|
||||
.. [1] https://gcc.gnu.org/onlinedocs/gccint/Plugins.html
|
||||
.. [2] https://gcc.gnu.org/onlinedocs/gccint/Plugin-API.html#Plugin-API
|
||||
.. [3] https://gcc.gnu.org/onlinedocs/gccint/GIMPLE.html
|
||||
.. [4] https://gcc.gnu.org/onlinedocs/gccint/IPA.html
|
||||
.. [5] https://gcc.gnu.org/onlinedocs/gccint/RTL.html
|
||||
.. [6] https://grsecurity.net/
|
||||
.. [7] https://pax.grsecurity.net/
|
||||
|
||||
|
||||
2. Files
|
||||
========
|
||||
Files
|
||||
=====
|
||||
|
||||
**$(src)/scripts/gcc-plugins**
|
||||
|
||||
$(src)/scripts/gcc-plugins
|
||||
This is the directory of the GCC plugins.
|
||||
|
||||
$(src)/scripts/gcc-plugins/gcc-common.h
|
||||
**$(src)/scripts/gcc-plugins/gcc-common.h**
|
||||
|
||||
This is a compatibility header for GCC plugins.
|
||||
It should always be included instead of individual gcc headers.
|
||||
|
||||
$(src)/scripts/gcc-plugin.sh
|
||||
**$(src)/scripts/gcc-plugin.sh**
|
||||
|
||||
This script checks the availability of the included headers in
|
||||
gcc-common.h and chooses the proper host compiler to build the plugins
|
||||
(gcc-4.7 can be built by either gcc or g++).
|
||||
|
||||
$(src)/scripts/gcc-plugins/gcc-generate-gimple-pass.h
|
||||
$(src)/scripts/gcc-plugins/gcc-generate-ipa-pass.h
|
||||
$(src)/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h
|
||||
$(src)/scripts/gcc-plugins/gcc-generate-rtl-pass.h
|
||||
**$(src)/scripts/gcc-plugins/gcc-generate-gimple-pass.h,
|
||||
$(src)/scripts/gcc-plugins/gcc-generate-ipa-pass.h,
|
||||
$(src)/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h,
|
||||
$(src)/scripts/gcc-plugins/gcc-generate-rtl-pass.h**
|
||||
|
||||
These headers automatically generate the registration structures for
|
||||
GIMPLE, SIMPLE_IPA, IPA and RTL passes. They support all gcc versions
|
||||
from 4.5 to 6.0.
|
||||
They should be preferred to creating the structures by hand.
|
||||
|
||||
|
||||
3. Usage
|
||||
========
|
||||
Usage
|
||||
=====
|
||||
|
||||
You must install the gcc plugin headers for your gcc version,
|
||||
e.g., on Ubuntu for gcc-4.9:
|
||||
e.g., on Ubuntu for gcc-4.9::
|
||||
|
||||
apt-get install gcc-4.9-plugin-dev
|
||||
|
||||
Enable a GCC plugin based feature in the kernel config:
|
||||
Enable a GCC plugin based feature in the kernel config::
|
||||
|
||||
CONFIG_GCC_PLUGIN_CYC_COMPLEXITY = y
|
||||
|
||||
To compile only the plugin(s):
|
||||
To compile only the plugin(s)::
|
||||
|
||||
make gcc-plugins
|
||||
|
||||
|
@ -1,4 +1,9 @@
|
||||
Notes on the change from 16-bit UIDs to 32-bit UIDs:
|
||||
===================================================
|
||||
Notes on the change from 16-bit UIDs to 32-bit UIDs
|
||||
===================================================
|
||||
|
||||
:Author: Chris Wing <wingc@umich.edu>
|
||||
:Last updated: January 11, 2000
|
||||
|
||||
- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t
|
||||
when communicating between user and kernel space in an ioctl or data
|
||||
@ -28,30 +33,34 @@ What's left to be done for 32-bit UIDs on all Linux architectures:
|
||||
uses the 32-bit UID system calls properly otherwise.
|
||||
|
||||
This affects at least:
|
||||
iBCS on Intel
|
||||
|
||||
sparc32 emulation on sparc64
|
||||
(need to support whatever new 32-bit UID system calls are added to
|
||||
sparc32)
|
||||
- iBCS on Intel
|
||||
|
||||
- sparc32 emulation on sparc64
|
||||
(need to support whatever new 32-bit UID system calls are added to
|
||||
sparc32)
|
||||
|
||||
- Validate that all filesystems behave properly.
|
||||
|
||||
At present, 32-bit UIDs _should_ work for:
|
||||
ext2
|
||||
ufs
|
||||
isofs
|
||||
nfs
|
||||
coda
|
||||
udf
|
||||
|
||||
- ext2
|
||||
- ufs
|
||||
- isofs
|
||||
- nfs
|
||||
- coda
|
||||
- udf
|
||||
|
||||
Ioctl() fixups have been made for:
|
||||
ncpfs
|
||||
smbfs
|
||||
|
||||
- ncpfs
|
||||
- smbfs
|
||||
|
||||
Filesystems with simple fixups to prevent 16-bit UID wraparound:
|
||||
minix
|
||||
sysv
|
||||
qnx4
|
||||
|
||||
- minix
|
||||
- sysv
|
||||
- qnx4
|
||||
|
||||
Other filesystems have not been checked yet.
|
||||
|
||||
@ -69,9 +78,3 @@ What's left to be done for 32-bit UIDs on all Linux architectures:
|
||||
- make sure that the UID mapping feature of AX25 networking works properly
|
||||
(it should be safe because it's always used a 32-bit integer to
|
||||
communicate between user and kernel)
|
||||
|
||||
|
||||
Chris Wing
|
||||
wingc@umich.edu
|
||||
|
||||
last updated: January 11, 2000
|
||||
|
@ -1,90 +1,105 @@
|
||||
Introduction:
|
||||
==========================================================
|
||||
Linux support for random number generator in i8xx chipsets
|
||||
==========================================================
|
||||
|
||||
The hw_random framework is software that makes use of a
|
||||
special hardware feature on your CPU or motherboard,
|
||||
a Random Number Generator (RNG). The software has two parts:
|
||||
a core providing the /dev/hwrng character device and its
|
||||
sysfs support, plus a hardware-specific driver that plugs
|
||||
into that core.
|
||||
Introduction
|
||||
============
|
||||
|
||||
To make the most effective use of these mechanisms, you
|
||||
should download the support software as well. Download the
|
||||
latest version of the "rng-tools" package from the
|
||||
hw_random driver's official Web site:
|
||||
The hw_random framework is software that makes use of a
|
||||
special hardware feature on your CPU or motherboard,
|
||||
a Random Number Generator (RNG). The software has two parts:
|
||||
a core providing the /dev/hwrng character device and its
|
||||
sysfs support, plus a hardware-specific driver that plugs
|
||||
into that core.
|
||||
|
||||
http://sourceforge.net/projects/gkernel/
|
||||
To make the most effective use of these mechanisms, you
|
||||
should download the support software as well. Download the
|
||||
latest version of the "rng-tools" package from the
|
||||
hw_random driver's official Web site:
|
||||
|
||||
Those tools use /dev/hwrng to fill the kernel entropy pool,
|
||||
which is used internally and exported by the /dev/urandom and
|
||||
/dev/random special files.
|
||||
http://sourceforge.net/projects/gkernel/
|
||||
|
||||
Theory of operation:
|
||||
Those tools use /dev/hwrng to fill the kernel entropy pool,
|
||||
which is used internally and exported by the /dev/urandom and
|
||||
/dev/random special files.
|
||||
|
||||
CHARACTER DEVICE. Using the standard open()
|
||||
and read() system calls, you can read random data from
|
||||
the hardware RNG device. This data is NOT CHECKED by any
|
||||
fitness tests, and could potentially be bogus (if the
|
||||
hardware is faulty or has been tampered with). Data is only
|
||||
output if the hardware "has-data" flag is set, but nevertheless
|
||||
a security-conscious person would run fitness tests on the
|
||||
data before assuming it is truly random.
|
||||
Theory of operation
|
||||
===================
|
||||
|
||||
The rng-tools package uses such tests in "rngd", and lets you
|
||||
run them by hand with a "rngtest" utility.
|
||||
CHARACTER DEVICE. Using the standard open()
|
||||
and read() system calls, you can read random data from
|
||||
the hardware RNG device. This data is NOT CHECKED by any
|
||||
fitness tests, and could potentially be bogus (if the
|
||||
hardware is faulty or has been tampered with). Data is only
|
||||
output if the hardware "has-data" flag is set, but nevertheless
|
||||
a security-conscious person would run fitness tests on the
|
||||
data before assuming it is truly random.
|
||||
|
||||
/dev/hwrng is char device major 10, minor 183.
|
||||
The rng-tools package uses such tests in "rngd", and lets you
|
||||
run them by hand with a "rngtest" utility.
|
||||
|
||||
CLASS DEVICE. There is a /sys/class/misc/hw_random node with
|
||||
two unique attributes, "rng_available" and "rng_current". The
|
||||
"rng_available" attribute lists the hardware-specific drivers
|
||||
available, while "rng_current" lists the one which is currently
|
||||
connected to /dev/hwrng. If your system has more than one
|
||||
RNG available, you may change the one used by writing a name from
|
||||
the list in "rng_available" into "rng_current".
|
||||
/dev/hwrng is char device major 10, minor 183.
|
||||
|
||||
CLASS DEVICE. There is a /sys/class/misc/hw_random node with
|
||||
two unique attributes, "rng_available" and "rng_current". The
|
||||
"rng_available" attribute lists the hardware-specific drivers
|
||||
available, while "rng_current" lists the one which is currently
|
||||
connected to /dev/hwrng. If your system has more than one
|
||||
RNG available, you may change the one used by writing a name from
|
||||
the list in "rng_available" into "rng_current".
|
||||
|
||||
==========================================================================
|
||||
|
||||
Hardware driver for Intel/AMD/VIA Random Number Generators (RNG)
|
||||
Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com>
|
||||
Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com>
|
||||
|
||||
Hardware driver for Intel/AMD/VIA Random Number Generators (RNG)
|
||||
- Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com>
|
||||
- Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com>
|
||||
|
||||
|
||||
About the Intel RNG hardware, from the firmware hub datasheet:
|
||||
About the Intel RNG hardware, from the firmware hub datasheet
|
||||
=============================================================
|
||||
|
||||
The Firmware Hub integrates a Random Number Generator (RNG)
|
||||
using thermal noise generated from inherently random quantum
|
||||
mechanical properties of silicon. When not generating new random
|
||||
bits the RNG circuitry will enter a low power state. Intel will
|
||||
provide a binary software driver to give third party software
|
||||
access to our RNG for use as a security feature. At this time,
|
||||
the RNG is only to be used with a system in an OS-present state.
|
||||
The Firmware Hub integrates a Random Number Generator (RNG)
|
||||
using thermal noise generated from inherently random quantum
|
||||
mechanical properties of silicon. When not generating new random
|
||||
bits the RNG circuitry will enter a low power state. Intel will
|
||||
provide a binary software driver to give third party software
|
||||
access to our RNG for use as a security feature. At this time,
|
||||
the RNG is only to be used with a system in an OS-present state.
|
||||
|
||||
Intel RNG Driver notes:
|
||||
Intel RNG Driver notes
|
||||
======================
|
||||
|
||||
* FIXME: support poll(2)
|
||||
FIXME: support poll(2)
|
||||
|
||||
NOTE: request_mem_region was removed, for three reasons:
|
||||
1) Only one RNG is supported by this driver, 2) The location
|
||||
used by the RNG is a fixed location in MMIO-addressable memory,
|
||||
.. note::
|
||||
|
||||
request_mem_region was removed, for three reasons:
|
||||
|
||||
1) Only one RNG is supported by this driver;
|
||||
2) The location used by the RNG is a fixed location in
|
||||
MMIO-addressable memory;
|
||||
3) users with properly working BIOS e820 handling will always
|
||||
have the region in which the RNG is located reserved, so
|
||||
request_mem_region calls always fail for proper setups.
|
||||
However, for people who use mem=XX, BIOS e820 information is
|
||||
-not- in /proc/iomem, and request_mem_region(RNG_ADDR) can
|
||||
succeed.
|
||||
have the region in which the RNG is located reserved, so
|
||||
request_mem_region calls always fail for proper setups.
|
||||
However, for people who use mem=XX, BIOS e820 information is
|
||||
**not** in /proc/iomem, and request_mem_region(RNG_ADDR) can
|
||||
succeed.
|
||||
|
||||
Driver details:
|
||||
Driver details
|
||||
==============
|
||||
|
||||
Based on:
|
||||
Based on:
|
||||
Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet
|
||||
May 1999 Order Number: 290658-002 R
|
||||
May 1999 Order Number: 290658-002 R
|
||||
|
||||
Intel 82802 Firmware Hub: Random Number Generator
|
||||
Intel 82802 Firmware Hub:
|
||||
Random Number Generator
|
||||
Programmer's Reference Manual
|
||||
December 1999 Order Number: 298029-001 R
|
||||
December 1999 Order Number: 298029-001 R
|
||||
|
||||
Intel 82802 Firmware HUB Random Number Generator Driver
|
||||
Intel 82802 Firmware HUB Random Number Generator Driver
|
||||
Copyright (c) 2000 Matt Sottek <msottek@quiknet.com>
|
||||
|
||||
Special thanks to Matt Sottek. I did the "guts", he
|
||||
did the "brains" and all the testing.
|
||||
Special thanks to Matt Sottek. I did the "guts", he
|
||||
did the "brains" and all the testing.
|
||||
|
@ -1,6 +1,9 @@
|
||||
===========================
|
||||
Hardware Spinlock Framework
|
||||
===========================
|
||||
|
||||
1. Introduction
|
||||
Introduction
|
||||
============
|
||||
|
||||
Hardware spinlock modules provide hardware assistance for synchronization
|
||||
and mutual exclusion between heterogeneous processors and those not operating
|
||||
@ -32,286 +35,370 @@ structure).
|
||||
A common hwspinlock interface makes it possible to have generic, platform-
|
||||
independent, drivers.
|
||||
|
||||
2. User API
|
||||
User API
|
||||
========
|
||||
|
||||
::
|
||||
|
||||
struct hwspinlock *hwspin_lock_request(void);
|
||||
- dynamically assign an hwspinlock and return its address, or NULL
|
||||
in case an unused hwspinlock isn't available. Users of this
|
||||
API will usually want to communicate the lock's id to the remote core
|
||||
before it can be used to achieve synchronization.
|
||||
Should be called from a process context (might sleep).
|
||||
|
||||
Dynamically assign an hwspinlock and return its address, or NULL
|
||||
in case an unused hwspinlock isn't available. Users of this
|
||||
API will usually want to communicate the lock's id to the remote core
|
||||
before it can be used to achieve synchronization.
|
||||
|
||||
Should be called from a process context (might sleep).
|
||||
|
||||
::
|
||||
|
||||
struct hwspinlock *hwspin_lock_request_specific(unsigned int id);
|
||||
- assign a specific hwspinlock id and return its address, or NULL
|
||||
if that hwspinlock is already in use. Usually board code will
|
||||
be calling this function in order to reserve specific hwspinlock
|
||||
ids for predefined purposes.
|
||||
Should be called from a process context (might sleep).
|
||||
|
||||
Assign a specific hwspinlock id and return its address, or NULL
|
||||
if that hwspinlock is already in use. Usually board code will
|
||||
be calling this function in order to reserve specific hwspinlock
|
||||
ids for predefined purposes.
|
||||
|
||||
Should be called from a process context (might sleep).
|
||||
|
||||
::
|
||||
|
||||
int of_hwspin_lock_get_id(struct device_node *np, int index);
|
||||
- retrieve the global lock id for an OF phandle-based specific lock.
|
||||
This function provides a means for DT users of a hwspinlock module
|
||||
to get the global lock id of a specific hwspinlock, so that it can
|
||||
be requested using the normal hwspin_lock_request_specific() API.
|
||||
The function returns a lock id number on success, -EPROBE_DEFER if
|
||||
the hwspinlock device is not yet registered with the core, or other
|
||||
error values.
|
||||
Should be called from a process context (might sleep).
|
||||
|
||||
Retrieve the global lock id for an OF phandle-based specific lock.
|
||||
This function provides a means for DT users of a hwspinlock module
|
||||
to get the global lock id of a specific hwspinlock, so that it can
|
||||
be requested using the normal hwspin_lock_request_specific() API.
|
||||
|
||||
The function returns a lock id number on success, -EPROBE_DEFER if
|
||||
the hwspinlock device is not yet registered with the core, or other
|
||||
error values.
|
||||
|
||||
Should be called from a process context (might sleep).
|
||||
|
||||
::
|
||||
|
||||
int hwspin_lock_free(struct hwspinlock *hwlock);
|
||||
- free a previously-assigned hwspinlock; returns 0 on success, or an
|
||||
appropriate error code on failure (e.g. -EINVAL if the hwspinlock
|
||||
is already free).
|
||||
Should be called from a process context (might sleep).
|
||||
|
||||
Free a previously-assigned hwspinlock; returns 0 on success, or an
|
||||
appropriate error code on failure (e.g. -EINVAL if the hwspinlock
|
||||
is already free).
|
||||
|
||||
Should be called from a process context (might sleep).
|
||||
|
||||
::
|
||||
|
||||
int hwspin_lock_timeout(struct hwspinlock *hwlock, unsigned int timeout);
|
||||
- lock a previously-assigned hwspinlock with a timeout limit (specified in
|
||||
msecs). If the hwspinlock is already taken, the function will busy loop
|
||||
waiting for it to be released, but give up when the timeout elapses.
|
||||
Upon a successful return from this function, preemption is disabled so
|
||||
the caller must not sleep, and is advised to release the hwspinlock as
|
||||
soon as possible, in order to minimize remote cores polling on the
|
||||
hardware interconnect.
|
||||
Returns 0 when successful and an appropriate error code otherwise (most
|
||||
notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
|
||||
The function will never sleep.
|
||||
|
||||
Lock a previously-assigned hwspinlock with a timeout limit (specified in
|
||||
msecs). If the hwspinlock is already taken, the function will busy loop
|
||||
waiting for it to be released, but give up when the timeout elapses.
|
||||
Upon a successful return from this function, preemption is disabled so
|
||||
the caller must not sleep, and is advised to release the hwspinlock as
|
||||
soon as possible, in order to minimize remote cores polling on the
|
||||
hardware interconnect.
|
||||
|
||||
Returns 0 when successful and an appropriate error code otherwise (most
|
||||
notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
|
||||
The function will never sleep.
|
||||
|
||||
::
|
||||
|
||||
int hwspin_lock_timeout_irq(struct hwspinlock *hwlock, unsigned int timeout);
|
||||
- lock a previously-assigned hwspinlock with a timeout limit (specified in
|
||||
msecs). If the hwspinlock is already taken, the function will busy loop
|
||||
waiting for it to be released, but give up when the timeout elapses.
|
||||
Upon a successful return from this function, preemption and the local
|
||||
interrupts are disabled, so the caller must not sleep, and is advised to
|
||||
release the hwspinlock as soon as possible.
|
||||
Returns 0 when successful and an appropriate error code otherwise (most
|
||||
notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
|
||||
The function will never sleep.
|
||||
|
||||
Lock a previously-assigned hwspinlock with a timeout limit (specified in
|
||||
msecs). If the hwspinlock is already taken, the function will busy loop
|
||||
waiting for it to be released, but give up when the timeout elapses.
|
||||
Upon a successful return from this function, preemption and the local
|
||||
interrupts are disabled, so the caller must not sleep, and is advised to
|
||||
release the hwspinlock as soon as possible.
|
||||
|
||||
Returns 0 when successful and an appropriate error code otherwise (most
|
||||
notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
|
||||
The function will never sleep.
|
||||
|
||||
::
|
||||
|
||||
int hwspin_lock_timeout_irqsave(struct hwspinlock *hwlock, unsigned int to,
|
||||
unsigned long *flags);
|
||||
- lock a previously-assigned hwspinlock with a timeout limit (specified in
|
||||
msecs). If the hwspinlock is already taken, the function will busy loop
|
||||
waiting for it to be released, but give up when the timeout elapses.
|
||||
Upon a successful return from this function, preemption is disabled,
|
||||
local interrupts are disabled and their previous state is saved at the
|
||||
given flags placeholder. The caller must not sleep, and is advised to
|
||||
release the hwspinlock as soon as possible.
|
||||
Returns 0 when successful and an appropriate error code otherwise (most
|
||||
notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
|
||||
The function will never sleep.
|
||||
unsigned long *flags);
|
||||
|
||||
Lock a previously-assigned hwspinlock with a timeout limit (specified in
|
||||
msecs). If the hwspinlock is already taken, the function will busy loop
|
||||
waiting for it to be released, but give up when the timeout elapses.
|
||||
Upon a successful return from this function, preemption is disabled,
|
||||
local interrupts are disabled and their previous state is saved at the
|
||||
given flags placeholder. The caller must not sleep, and is advised to
|
||||
release the hwspinlock as soon as possible.
|
||||
|
||||
Returns 0 when successful and an appropriate error code otherwise (most
|
||||
notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
|
||||
|
||||
The function will never sleep.
|
||||
|
||||
::
|
||||
|
||||
int hwspin_trylock(struct hwspinlock *hwlock);
|
||||
- attempt to lock a previously-assigned hwspinlock, but immediately fail if
|
||||
it is already taken.
|
||||
Upon a successful return from this function, preemption is disabled so
|
||||
caller must not sleep, and is advised to release the hwspinlock as soon as
|
||||
possible, in order to minimize remote cores polling on the hardware
|
||||
interconnect.
|
||||
Returns 0 on success and an appropriate error code otherwise (most
|
||||
notably -EBUSY if the hwspinlock was already taken).
|
||||
The function will never sleep.
|
||||
|
||||
|
||||
Attempt to lock a previously-assigned hwspinlock, but immediately fail if
|
||||
it is already taken.
|
||||
|
||||
Upon a successful return from this function, preemption is disabled so
|
||||
caller must not sleep, and is advised to release the hwspinlock as soon as
|
||||
possible, in order to minimize remote cores polling on the hardware
|
||||
interconnect.
|
||||
|
||||
Returns 0 on success and an appropriate error code otherwise (most
|
||||
notably -EBUSY if the hwspinlock was already taken).
|
||||
The function will never sleep.
|
||||
|
||||
::
|
||||
|
||||
int hwspin_trylock_irq(struct hwspinlock *hwlock);
|
||||
- attempt to lock a previously-assigned hwspinlock, but immediately fail if
|
||||
it is already taken.
|
||||
Upon a successful return from this function, preemption and the local
|
||||
interrupts are disabled so caller must not sleep, and is advised to
|
||||
release the hwspinlock as soon as possible.
|
||||
Returns 0 on success and an appropriate error code otherwise (most
|
||||
notably -EBUSY if the hwspinlock was already taken).
|
||||
The function will never sleep.
|
||||
|
||||
|
||||
Attempt to lock a previously-assigned hwspinlock, but immediately fail if
|
||||
it is already taken.
|
||||
|
||||
Upon a successful return from this function, preemption and the local
|
||||
interrupts are disabled so caller must not sleep, and is advised to
|
||||
release the hwspinlock as soon as possible.
|
||||
|
||||
Returns 0 on success and an appropriate error code otherwise (most
|
||||
notably -EBUSY if the hwspinlock was already taken).
|
||||
|
||||
The function will never sleep.
|
||||
|
||||
::
|
||||
|
||||
int hwspin_trylock_irqsave(struct hwspinlock *hwlock, unsigned long *flags);
|
||||
- attempt to lock a previously-assigned hwspinlock, but immediately fail if
|
||||
it is already taken.
|
||||
Upon a successful return from this function, preemption is disabled,
|
||||
the local interrupts are disabled and their previous state is saved
|
||||
at the given flags placeholder. The caller must not sleep, and is advised
|
||||
to release the hwspinlock as soon as possible.
|
||||
Returns 0 on success and an appropriate error code otherwise (most
|
||||
notably -EBUSY if the hwspinlock was already taken).
|
||||
The function will never sleep.
|
||||
|
||||
Attempt to lock a previously-assigned hwspinlock, but immediately fail if
|
||||
it is already taken.
|
||||
|
||||
Upon a successful return from this function, preemption is disabled,
|
||||
the local interrupts are disabled and their previous state is saved
|
||||
at the given flags placeholder. The caller must not sleep, and is advised
|
||||
to release the hwspinlock as soon as possible.
|
||||
|
||||
Returns 0 on success and an appropriate error code otherwise (most
|
||||
notably -EBUSY if the hwspinlock was already taken).
|
||||
The function will never sleep.
|
||||
|
||||
::
|
||||
|
||||
void hwspin_unlock(struct hwspinlock *hwlock);
|
||||
- unlock a previously-locked hwspinlock. Always succeed, and can be called
|
||||
from any context (the function never sleeps). Note: code should _never_
|
||||
unlock an hwspinlock which is already unlocked (there is no protection
|
||||
against this).
|
||||
|
||||
Unlock a previously-locked hwspinlock. Always succeeds, and can be called
|
||||
from any context (the function never sleeps).
|
||||
|
||||
.. note::
|
||||
|
||||
code should **never** unlock an hwspinlock which is already unlocked
|
||||
(there is no protection against this).
|
||||
|
||||
::
|
||||
|
||||
void hwspin_unlock_irq(struct hwspinlock *hwlock);
|
||||
- unlock a previously-locked hwspinlock and enable local interrupts.
|
||||
The caller should _never_ unlock an hwspinlock which is already unlocked.
|
||||
Doing so is considered a bug (there is no protection against this).
|
||||
Upon a successful return from this function, preemption and local
|
||||
interrupts are enabled. This function will never sleep.
|
||||
|
||||
Unlock a previously-locked hwspinlock and enable local interrupts.
|
||||
The caller should **never** unlock an hwspinlock which is already unlocked.
|
||||
|
||||
Doing so is considered a bug (there is no protection against this).
|
||||
Upon a successful return from this function, preemption and local
|
||||
interrupts are enabled. This function will never sleep.
|
||||
|
||||
::
|
||||
|
||||
void
|
||||
hwspin_unlock_irqrestore(struct hwspinlock *hwlock, unsigned long *flags);
|
||||
- unlock a previously-locked hwspinlock.
|
||||
The caller should _never_ unlock an hwspinlock which is already unlocked.
|
||||
Doing so is considered a bug (there is no protection against this).
|
||||
Upon a successful return from this function, preemption is reenabled,
|
||||
and the state of the local interrupts is restored to the state saved at
|
||||
the given flags. This function will never sleep.
|
||||
|
||||
Unlock a previously-locked hwspinlock.
|
||||
|
||||
The caller should **never** unlock an hwspinlock which is already unlocked.
|
||||
Doing so is considered a bug (there is no protection against this).
|
||||
Upon a successful return from this function, preemption is reenabled,
|
||||
and the state of the local interrupts is restored to the state saved at
|
||||
the given flags. This function will never sleep.
|
||||
|
||||
::
|
||||
|
||||
int hwspin_lock_get_id(struct hwspinlock *hwlock);
|
||||
- retrieve id number of a given hwspinlock. This is needed when an
|
||||
hwspinlock is dynamically assigned: before it can be used to achieve
|
||||
mutual exclusion with a remote cpu, the id number should be communicated
|
||||
to the remote task with which we want to synchronize.
|
||||
Returns the hwspinlock id number, or -EINVAL if hwlock is null.
|
||||
|
||||
3. Typical usage
|
||||
Retrieve id number of a given hwspinlock. This is needed when an
|
||||
hwspinlock is dynamically assigned: before it can be used to achieve
|
||||
mutual exclusion with a remote cpu, the id number should be communicated
|
||||
to the remote task with which we want to synchronize.
|
||||
|
||||
#include <linux/hwspinlock.h>
|
||||
#include <linux/err.h>
|
||||
Returns the hwspinlock id number, or -EINVAL if hwlock is null.
|
||||
|
||||
int hwspinlock_example1(void)
|
||||
{
|
||||
struct hwspinlock *hwlock;
|
||||
int ret;
|
||||
Typical usage
|
||||
=============
|
||||
|
||||
/* dynamically assign a hwspinlock */
|
||||
hwlock = hwspin_lock_request();
|
||||
if (!hwlock)
|
||||
...
|
||||
::
|
||||
|
||||
id = hwspin_lock_get_id(hwlock);
|
||||
/* probably need to communicate id to a remote processor now */
|
||||
#include <linux/hwspinlock.h>
|
||||
#include <linux/err.h>
|
||||
|
||||
/* take the lock, spin for 1 sec if it's already taken */
|
||||
ret = hwspin_lock_timeout(hwlock, 1000);
|
||||
if (ret)
|
||||
...
|
||||
int hwspinlock_example1(void)
|
||||
{
|
||||
struct hwspinlock *hwlock;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* we took the lock, do our thing now, but do NOT sleep
|
||||
*/
|
||||
/* dynamically assign a hwspinlock */
|
||||
hwlock = hwspin_lock_request();
|
||||
if (!hwlock)
|
||||
...
|
||||
|
||||
/* release the lock */
|
||||
hwspin_unlock(hwlock);
|
||||
id = hwspin_lock_get_id(hwlock);
|
||||
/* probably need to communicate id to a remote processor now */
|
||||
|
||||
/* free the lock */
|
||||
ret = hwspin_lock_free(hwlock);
|
||||
if (ret)
|
||||
...
|
||||
/* take the lock, spin for 1 sec if it's already taken */
|
||||
ret = hwspin_lock_timeout(hwlock, 1000);
|
||||
if (ret)
|
||||
...
|
||||
|
||||
return ret;
|
||||
}
|
||||
/*
|
||||
* we took the lock, do our thing now, but do NOT sleep
|
||||
*/
|
||||
|
||||
int hwspinlock_example2(void)
|
||||
{
|
||||
struct hwspinlock *hwlock;
|
||||
int ret;
|
||||
/* release the lock */
|
||||
hwspin_unlock(hwlock);
|
||||
|
||||
/*
|
||||
* assign a specific hwspinlock id - this should be called early
|
||||
* by board init code.
|
||||
*/
|
||||
hwlock = hwspin_lock_request_specific(PREDEFINED_LOCK_ID);
|
||||
if (!hwlock)
|
||||
...
|
||||
/* free the lock */
|
||||
ret = hwspin_lock_free(hwlock);
|
||||
if (ret)
|
||||
...
|
||||
|
||||
/* try to take it, but don't spin on it */
|
||||
ret = hwspin_trylock(hwlock);
|
||||
if (ret) {
|
||||
pr_info("lock is already taken\n");
|
||||
return -EBUSY;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* we took the lock, do our thing now, but do NOT sleep
|
||||
*/
|
||||
int hwspinlock_example2(void)
|
||||
{
|
||||
struct hwspinlock *hwlock;
|
||||
int ret;
|
||||
|
||||
/* release the lock */
|
||||
hwspin_unlock(hwlock);
|
||||
/*
|
||||
* assign a specific hwspinlock id - this should be called early
|
||||
* by board init code.
|
||||
*/
|
||||
hwlock = hwspin_lock_request_specific(PREDEFINED_LOCK_ID);
|
||||
if (!hwlock)
|
||||
...
|
||||
|
||||
/* free the lock */
|
||||
ret = hwspin_lock_free(hwlock);
|
||||
if (ret)
|
||||
...
|
||||
/* try to take it, but don't spin on it */
|
||||
ret = hwspin_trylock(hwlock);
|
||||
if (ret) {
|
||||
pr_info("lock is already taken\n");
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
/*
|
||||
* we took the lock, do our thing now, but do NOT sleep
|
||||
*/
|
||||
|
||||
/* release the lock */
|
||||
hwspin_unlock(hwlock);
|
||||
|
||||
/* free the lock */
|
||||
ret = hwspin_lock_free(hwlock);
|
||||
if (ret)
|
||||
...
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
4. API for implementors
|
||||
API for implementors
|
||||
====================
|
||||
|
||||
::
|
||||
|
||||
int hwspin_lock_register(struct hwspinlock_device *bank, struct device *dev,
|
||||
const struct hwspinlock_ops *ops, int base_id, int num_locks);
|
||||
- to be called from the underlying platform-specific implementation, in
|
||||
order to register a new hwspinlock device (which is usually a bank of
|
||||
numerous locks). Should be called from a process context (this function
|
||||
might sleep).
|
||||
Returns 0 on success, or appropriate error code on failure.
|
||||
|
||||
To be called from the underlying platform-specific implementation, in
|
||||
order to register a new hwspinlock device (which is usually a bank of
|
||||
numerous locks). Should be called from a process context (this function
|
||||
might sleep).
|
||||
|
||||
Returns 0 on success, or appropriate error code on failure.
|
||||
|
||||
::
|
||||
|
||||
int hwspin_lock_unregister(struct hwspinlock_device *bank);
|
||||
- to be called from the underlying vendor-specific implementation, in order
|
||||
to unregister an hwspinlock device (which is usually a bank of numerous
|
||||
locks).
|
||||
Should be called from a process context (this function might sleep).
|
||||
Returns the address of hwspinlock on success, or NULL on error (e.g.
|
||||
if the hwspinlock is still in use).
|
||||
|
||||
5. Important structs
|
||||
To be called from the underlying vendor-specific implementation, in order
|
||||
to unregister an hwspinlock device (which is usually a bank of numerous
|
||||
locks).
|
||||
|
||||
Should be called from a process context (this function might sleep).
|
||||
|
||||
Returns 0 on success, or an appropriate error code on failure (e.g.
|
||||
if the hwspinlock is still in use).
|
||||
|
||||
Important structs
|
||||
=================
|
||||
|
||||
struct hwspinlock_device is a device which usually contains a bank
|
||||
of hardware locks. It is registered by the underlying hwspinlock
|
||||
implementation using the hwspin_lock_register() API.
|
||||
|
||||
/**
|
||||
* struct hwspinlock_device - a device which usually spans numerous hwspinlocks
|
||||
* @dev: underlying device, will be used to invoke runtime PM api
|
||||
* @ops: platform-specific hwspinlock handlers
|
||||
* @base_id: id index of the first lock in this device
|
||||
* @num_locks: number of locks in this device
|
||||
* @lock: dynamically allocated array of 'struct hwspinlock'
|
||||
*/
|
||||
struct hwspinlock_device {
|
||||
struct device *dev;
|
||||
const struct hwspinlock_ops *ops;
|
||||
int base_id;
|
||||
int num_locks;
|
||||
struct hwspinlock lock[0];
|
||||
};
|
||||
::
|
||||
|
||||
/**
|
||||
* struct hwspinlock_device - a device which usually spans numerous hwspinlocks
|
||||
* @dev: underlying device, will be used to invoke runtime PM api
|
||||
* @ops: platform-specific hwspinlock handlers
|
||||
* @base_id: id index of the first lock in this device
|
||||
* @num_locks: number of locks in this device
|
||||
* @lock: dynamically allocated array of 'struct hwspinlock'
|
||||
*/
|
||||
struct hwspinlock_device {
|
||||
struct device *dev;
|
||||
const struct hwspinlock_ops *ops;
|
||||
int base_id;
|
||||
int num_locks;
|
||||
struct hwspinlock lock[0];
|
||||
};
|
||||
|
||||
struct hwspinlock_device contains an array of hwspinlock structs, each
|
||||
of which represents a single hardware lock:
|
||||
of which represents a single hardware lock::
|
||||
|
||||
/**
|
||||
* struct hwspinlock - this struct represents a single hwspinlock instance
|
||||
* @bank: the hwspinlock_device structure which owns this lock
|
||||
* @lock: initialized and used by hwspinlock core
|
||||
* @priv: private data, owned by the underlying platform-specific hwspinlock drv
|
||||
*/
|
||||
struct hwspinlock {
|
||||
struct hwspinlock_device *bank;
|
||||
spinlock_t lock;
|
||||
void *priv;
|
||||
};
|
||||
/**
|
||||
* struct hwspinlock - this struct represents a single hwspinlock instance
|
||||
* @bank: the hwspinlock_device structure which owns this lock
|
||||
* @lock: initialized and used by hwspinlock core
|
||||
* @priv: private data, owned by the underlying platform-specific hwspinlock drv
|
||||
*/
|
||||
struct hwspinlock {
|
||||
struct hwspinlock_device *bank;
|
||||
spinlock_t lock;
|
||||
void *priv;
|
||||
};
|
||||
|
||||
When registering a bank of locks, the hwspinlock driver only needs to
|
||||
set the priv members of the locks. The rest of the members are set and
|
||||
initialized by the hwspinlock core itself.
|
||||
|
||||
6. Implementation callbacks
|
||||
Implementation callbacks
|
||||
========================
|
||||
|
||||
There are three possible callbacks defined in 'struct hwspinlock_ops':
|
||||
There are three possible callbacks defined in 'struct hwspinlock_ops'::
|
||||
|
||||
struct hwspinlock_ops {
|
||||
int (*trylock)(struct hwspinlock *lock);
|
||||
void (*unlock)(struct hwspinlock *lock);
|
||||
void (*relax)(struct hwspinlock *lock);
|
||||
};
|
||||
struct hwspinlock_ops {
|
||||
int (*trylock)(struct hwspinlock *lock);
|
||||
void (*unlock)(struct hwspinlock *lock);
|
||||
void (*relax)(struct hwspinlock *lock);
|
||||
};
|
||||
|
||||
The first two callbacks are mandatory:
|
||||
|
||||
The ->trylock() callback should make a single attempt to take the lock, and
|
||||
return 0 on failure and 1 on success. This callback may _not_ sleep.
|
||||
return 0 on failure and 1 on success. This callback may **not** sleep.
|
||||
|
||||
The ->unlock() callback releases the lock. It always succeeds, and it, too,
|
||||
may _not_ sleep.
|
||||
may **not** sleep.
|
||||
|
||||
The ->relax() callback is optional. It is called by hwspinlock core while
|
||||
spinning on a lock, and can be used by the underlying implementation to force
|
||||
a delay between two successive invocations of ->trylock(). It may _not_ sleep.
|
||||
a delay between two successive invocations of ->trylock(). It may **not** sleep.
|
||||
|
@ -34,6 +34,8 @@ Supported adapters:
|
||||
* Intel Broxton (SOC)
|
||||
* Intel Lewisburg (PCH)
|
||||
* Intel Gemini Lake (SOC)
|
||||
* Intel Cannon Lake-H (PCH)
|
||||
* Intel Cannon Lake-LP (PCH)
|
||||
Datasheets: Publicly available at the Intel website
|
||||
|
||||
On Intel Patsburg and later chipsets, both the normal host SMBus controller
|
||||
|
@ -191,7 +191,7 @@ checking on future transactions.)
|
||||
4* Other ioctl() calls are converted to in-kernel function calls by
|
||||
i2c-dev. Examples include I2C_FUNCS, which queries the I2C adapter
|
||||
functionality using i2c.h:i2c_get_functionality(), and I2C_SMBUS, which
|
||||
performs an SMBus transaction using i2c-core.c:i2c_smbus_xfer().
|
||||
performs an SMBus transaction using i2c-core-smbus.c:i2c_smbus_xfer().
|
||||
|
||||
The i2c-dev driver is responsible for checking all the parameters that
|
||||
come from user-space for validity. After this point, there is no
|
||||
@ -200,13 +200,13 @@ and calls that would have been performed by kernel I2C chip drivers
|
||||
directly. This means that I2C bus drivers don't need to implement
|
||||
anything special to support access from user-space.
|
||||
|
||||
5* These i2c-core.c/i2c.h functions are wrappers to the actual
|
||||
implementation of your I2C bus driver. Each adapter must declare
|
||||
callback functions implementing these standard calls.
|
||||
i2c.h:i2c_get_functionality() calls i2c_adapter.algo->functionality(),
|
||||
while i2c-core.c:i2c_smbus_xfer() calls either
|
||||
5* These i2c.h functions are wrappers to the actual implementation of
|
||||
your I2C bus driver. Each adapter must declare callback functions
|
||||
implementing these standard calls. i2c.h:i2c_get_functionality() calls
|
||||
i2c_adapter.algo->functionality(), while
|
||||
i2c-core-smbus.c:i2c_smbus_xfer() calls either
|
||||
adapter.algo->smbus_xfer() if it is implemented, or if not,
|
||||
i2c-core.c:i2c_smbus_xfer_emulated() which in turn calls
|
||||
i2c-core-smbus.c:i2c_smbus_xfer_emulated() which in turn calls
|
||||
i2c_adapter.algo->master_xfer().
|
||||
|
||||
After your I2C bus driver has processed these requests, execution runs
|
||||
|
@ -6,7 +6,6 @@ Contents:
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:numbered:
|
||||
|
||||
input_uapi
|
||||
input_kapi
|
||||
|
@ -1,4 +1,5 @@
|
||||
Intel(R) TXT Overview:
|
||||
=====================
|
||||
Intel(R) TXT Overview
|
||||
=====================
|
||||
|
||||
Intel's technology for safer computing, Intel(R) Trusted Execution
|
||||
@ -8,9 +9,10 @@ provide the building blocks for creating trusted platforms.
|
||||
Intel TXT was formerly known by the code name LaGrande Technology (LT).
|
||||
|
||||
Intel TXT in Brief:
|
||||
o Provides dynamic root of trust for measurement (DRTM)
|
||||
o Data protection in case of improper shutdown
|
||||
o Measurement and verification of launched environment
|
||||
|
||||
- Provides dynamic root of trust for measurement (DRTM)
|
||||
- Data protection in case of improper shutdown
|
||||
- Measurement and verification of launched environment
|
||||
|
||||
Intel TXT is part of the vPro(TM) brand and is also available on some
|
||||
non-vPro systems. It is currently available on desktop systems
|
||||
@ -24,16 +26,21 @@ which has been updated for the new released platforms.
|
||||
|
||||
Intel TXT has been presented at various events over the past few
|
||||
years, some of which are:
|
||||
LinuxTAG 2008:
|
||||
|
||||
- LinuxTAG 2008:
|
||||
http://www.linuxtag.org/2008/en/conf/events/vp-donnerstag.html
|
||||
TRUST2008:
|
||||
|
||||
- TRUST2008:
|
||||
http://www.trust-conference.eu/downloads/Keynote-Speakers/
|
||||
3_David-Grawrock_The-Front-Door-of-Trusted-Computing.pdf
|
||||
IDF, Shanghai:
|
||||
http://www.prcidf.com.cn/index_en.html
|
||||
IDFs 2006, 2007 (I'm not sure if/where they are online)
|
||||
|
||||
Trusted Boot Project Overview:
|
||||
- IDF, Shanghai:
|
||||
http://www.prcidf.com.cn/index_en.html
|
||||
|
||||
- IDFs 2006, 2007
|
||||
(I'm not sure if/where they are online)
|
||||
|
||||
Trusted Boot Project Overview
|
||||
=============================
|
||||
|
||||
Trusted Boot (tboot) is an open source, pre-kernel/VMM module that
|
||||
@ -87,11 +94,12 @@ Intel-provided firmware).
|
||||
How Does it Work?
|
||||
=================
|
||||
|
||||
o Tboot is an executable that is launched by the bootloader as
|
||||
- Tboot is an executable that is launched by the bootloader as
|
||||
the "kernel" (the binary the bootloader executes).
|
||||
o It performs all of the work necessary to determine if the
|
||||
- It performs all of the work necessary to determine if the
|
||||
platform supports Intel TXT and, if so, executes the GETSEC[SENTER]
|
||||
processor instruction that initiates the dynamic root of trust.
|
||||
|
||||
- If tboot determines that the system does not support Intel TXT
|
||||
or is not configured correctly (e.g. the SINIT AC Module was
|
||||
incorrect), it will directly launch the kernel with no changes
|
||||
@ -99,12 +107,14 @@ o It performs all of the work necessary to determine if the
|
||||
- Tboot will output various information about its progress to the
|
||||
terminal, serial port, and/or an in-memory log; the output
|
||||
locations can be configured with a command line switch.
|
||||
o The GETSEC[SENTER] instruction will return control to tboot and
|
||||
|
||||
- The GETSEC[SENTER] instruction will return control to tboot and
|
||||
tboot then verifies certain aspects of the environment (e.g. TPM NV
|
||||
lock, e820 table does not have invalid entries, etc.).
|
||||
o It will wake the APs from the special sleep state the GETSEC[SENTER]
|
||||
- It will wake the APs from the special sleep state the GETSEC[SENTER]
|
||||
instruction had put them in and place them into a wait-for-SIPI
|
||||
state.
|
||||
|
||||
- Because the processors will not respond to an INIT or SIPI when
|
||||
in the TXT environment, it is necessary to create a small VT-x
|
||||
guest for the APs. When they run in this guest, they will
|
||||
@ -112,8 +122,10 @@ o It will wake the APs from the special sleep state the GETSEC[SENTER]
|
||||
VMEXITs, and then disable VT and jump to the SIPI vector. This
|
||||
approach seemed like a better choice than having to insert
|
||||
special code into the kernel's MP wakeup sequence.
|
||||
o Tboot then applies an (optional) user-defined launch policy to
|
||||
|
||||
- Tboot then applies an (optional) user-defined launch policy to
|
||||
verify the kernel and initrd.
|
||||
|
||||
- This policy is rooted in TPM NV and is described in the tboot
|
||||
project. The tboot project also contains code for tools to
|
||||
create and provision the policy.
|
||||
@ -121,30 +133,34 @@ o Tboot then applies an (optional) user-defined launch policy to
|
||||
then any kernel will be launched.
|
||||
- Policy action is flexible and can include halting on failures
|
||||
or simply logging them and continuing.
|
||||
o Tboot adjusts the e820 table provided by the bootloader to reserve
|
||||
|
||||
- Tboot adjusts the e820 table provided by the bootloader to reserve
|
||||
its own location in memory as well as to reserve certain other
|
||||
TXT-related regions.
|
||||
o As part of its launch, tboot DMA protects all of RAM (using the
|
||||
- As part of its launch, tboot DMA protects all of RAM (using the
|
||||
VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on'
|
||||
in order to remove this blanket protection and use VT-d's
|
||||
page-level protection.
|
||||
o Tboot will populate a shared page with some data about itself and
|
||||
- Tboot will populate a shared page with some data about itself and
|
||||
pass this to the Linux kernel as it transfers control.
|
||||
|
||||
- The location of the shared page is passed via the boot_params
|
||||
struct as a physical address.
|
||||
o The kernel will look for the tboot shared page address and, if it
|
||||
|
||||
- The kernel will look for the tboot shared page address and, if it
|
||||
exists, map it.
|
||||
o As one of the checks/protections provided by TXT, it makes a copy
|
||||
- As one of the checks/protections provided by TXT, it makes a copy
|
||||
of the VT-d DMARs in a DMA-protected region of memory and verifies
|
||||
them for correctness. The VT-d code will detect if the kernel was
|
||||
launched with tboot and use this copy instead of the one in the
|
||||
ACPI table.
|
||||
o At this point, tboot and TXT are out of the picture until a
|
||||
- At this point, tboot and TXT are out of the picture until a
|
||||
shutdown (S<n>)
|
||||
o In order to put a system into any of the sleep states after a TXT
|
||||
- In order to put a system into any of the sleep states after a TXT
|
||||
launch, TXT must first be exited. This is to prevent attacks that
|
||||
attempt to crash the system to gain control on reboot and steal
|
||||
data left in memory.
|
||||
|
||||
- The kernel will perform all of its sleep preparation and
|
||||
populate the shared page with the ACPI data needed to put the
|
||||
platform in the desired sleep state.
|
||||
@ -172,7 +188,7 @@ o In order to put a system into any of the sleep states after a TXT
|
||||
That's pretty much it for TXT support.
|
||||
|
||||
|
||||
Configuring the System:
|
||||
Configuring the System
|
||||
======================
|
||||
|
||||
This code works with 32bit, 32bit PAE, and 64bit (x86_64) kernels.
|
||||
@ -181,7 +197,8 @@ In BIOS, the user must enable: TPM, TXT, VT-x, VT-d. Not all BIOSes
|
||||
allow these to be individually enabled/disabled and the screens in
|
||||
which to find them are BIOS-specific.
|
||||
|
||||
grub.conf needs to be modified as follows:
|
||||
grub.conf needs to be modified as follows::
|
||||
|
||||
title Linux 2.6.29-tip w/ tboot
|
||||
root (hd0,0)
|
||||
kernel /tboot.gz logging=serial,vga,memory
|
||||
|
@ -1,66 +1,81 @@
|
||||
========================
|
||||
The io_mapping functions
|
||||
========================
|
||||
|
||||
API
|
||||
===
|
||||
|
||||
The io_mapping functions in linux/io-mapping.h provide an abstraction for
|
||||
efficiently mapping small regions of an I/O device to the CPU. The initial
|
||||
usage is to support the large graphics aperture on 32-bit processors where
|
||||
ioremap_wc cannot be used to statically map the entire aperture to the CPU
|
||||
as it would consume too much of the kernel address space.
|
||||
|
||||
A mapping object is created during driver initialization using
|
||||
A mapping object is created during driver initialization using::
|
||||
|
||||
struct io_mapping *io_mapping_create_wc(unsigned long base,
|
||||
unsigned long size)
|
||||
|
||||
'base' is the bus address of the region to be made
|
||||
mappable, while 'size' indicates how large a mapping region to
|
||||
enable. Both are in bytes.
|
||||
'base' is the bus address of the region to be made
|
||||
mappable, while 'size' indicates how large a mapping region to
|
||||
enable. Both are in bytes.
|
||||
|
||||
This _wc variant provides a mapping which may only be used
|
||||
with the io_mapping_map_atomic_wc or io_mapping_map_wc.
|
||||
This _wc variant provides a mapping which may only be used
|
||||
with the io_mapping_map_atomic_wc or io_mapping_map_wc functions.
|
||||
|
||||
With this mapping object, individual pages can be mapped either atomically
|
||||
or not, depending on the necessary scheduling environment. Of course, atomic
|
||||
maps are more efficient:
|
||||
maps are more efficient::
|
||||
|
||||
void *io_mapping_map_atomic_wc(struct io_mapping *mapping,
|
||||
unsigned long offset)
|
||||
|
||||
'offset' is the offset within the defined mapping region.
|
||||
Accessing addresses beyond the region specified in the
|
||||
creation function yields undefined results. Using an offset
|
||||
which is not page aligned yields an undefined result. The
|
||||
return value points to a single page in CPU address space.
|
||||
'offset' is the offset within the defined mapping region.
|
||||
Accessing addresses beyond the region specified in the
|
||||
creation function yields undefined results. Using an offset
|
||||
which is not page aligned yields an undefined result. The
|
||||
return value points to a single page in CPU address space.
|
||||
|
||||
This _wc variant returns a write-combining map to the
|
||||
page and may only be used with mappings created by
|
||||
io_mapping_create_wc
|
||||
This _wc variant returns a write-combining map to the
|
||||
page and may only be used with mappings created by
|
||||
io_mapping_create_wc
|
||||
|
||||
Note that the task may not sleep while holding this page
|
||||
mapped.
|
||||
Note that the task may not sleep while holding this page
|
||||
mapped.
|
||||
|
||||
::
|
||||
|
||||
void io_mapping_unmap_atomic(void *vaddr)
|
||||
|
||||
'vaddr' must be the value returned by the last
|
||||
io_mapping_map_atomic_wc call. This unmaps the specified
|
||||
page and allows the task to sleep once again.
|
||||
'vaddr' must be the value returned by the last
|
||||
io_mapping_map_atomic_wc call. This unmaps the specified
|
||||
page and allows the task to sleep once again.
|
||||
|
||||
If you need to sleep while holding the lock, you can use the non-atomic
|
||||
variant, although it may be significantly slower.
|
||||
|
||||
::
|
||||
|
||||
void *io_mapping_map_wc(struct io_mapping *mapping,
|
||||
unsigned long offset)
|
||||
|
||||
This works like io_mapping_map_atomic_wc except it allows
|
||||
the task to sleep while holding the page mapped.
|
||||
This works like io_mapping_map_atomic_wc except it allows
|
||||
the task to sleep while holding the page mapped.
|
||||
|
||||
|
||||
::
|
||||
|
||||
void io_mapping_unmap(void *vaddr)
|
||||
|
||||
This works like io_mapping_unmap_atomic, except it is used
|
||||
for pages mapped with io_mapping_map_wc.
|
||||
This works like io_mapping_unmap_atomic, except it is used
|
||||
for pages mapped with io_mapping_map_wc.
|
||||
|
||||
At driver close time, the io_mapping object must be freed:
|
||||
At driver close time, the io_mapping object must be freed::
|
||||
|
||||
void io_mapping_free(struct io_mapping *mapping)
|
||||
|
||||
Current Implementation:
|
||||
Current Implementation
|
||||
======================
|
||||
|
||||
The initial implementation of these functions uses existing mapping
|
||||
mechanisms and so provides only an abstraction layer and no new
|
||||
|
@ -1,3 +1,7 @@
|
||||
==============================================
|
||||
Ordering I/O writes to memory-mapped addresses
|
||||
==============================================
|
||||
|
||||
On some platforms, so-called memory-mapped I/O is weakly ordered. On such
|
||||
platforms, driver writers are responsible for ensuring that I/O writes to
|
||||
memory-mapped addresses on their device arrive in the order intended. This is
|
||||
@ -8,39 +12,39 @@ critical section of code protected by spinlocks. This would ensure that
|
||||
subsequent writes to I/O space arrived only after all prior writes (much like a
|
||||
memory barrier op, mb(), only with respect to I/O).
|
||||
|
||||
A more concrete example from a hypothetical device driver:
|
||||
A more concrete example from a hypothetical device driver::
|
||||
|
||||
...
|
||||
CPU A: spin_lock_irqsave(&dev_lock, flags)
|
||||
CPU A: val = readl(my_status);
|
||||
CPU A: ...
|
||||
CPU A: writel(newval, ring_ptr);
|
||||
CPU A: spin_unlock_irqrestore(&dev_lock, flags)
|
||||
...
|
||||
CPU B: spin_lock_irqsave(&dev_lock, flags)
|
||||
CPU B: val = readl(my_status);
|
||||
CPU B: ...
|
||||
CPU B: writel(newval2, ring_ptr);
|
||||
CPU B: spin_unlock_irqrestore(&dev_lock, flags)
|
||||
...
|
||||
...
|
||||
CPU A: spin_lock_irqsave(&dev_lock, flags)
|
||||
CPU A: val = readl(my_status);
|
||||
CPU A: ...
|
||||
CPU A: writel(newval, ring_ptr);
|
||||
CPU A: spin_unlock_irqrestore(&dev_lock, flags)
|
||||
...
|
||||
CPU B: spin_lock_irqsave(&dev_lock, flags)
|
||||
CPU B: val = readl(my_status);
|
||||
CPU B: ...
|
||||
CPU B: writel(newval2, ring_ptr);
|
||||
CPU B: spin_unlock_irqrestore(&dev_lock, flags)
|
||||
...
|
||||
|
||||
In the case above, the device may receive newval2 before it receives newval,
|
||||
which could cause problems. Fixing it is easy enough though:
|
||||
which could cause problems. Fixing it is easy enough though::
|
||||
|
||||
...
|
||||
CPU A: spin_lock_irqsave(&dev_lock, flags)
|
||||
CPU A: val = readl(my_status);
|
||||
CPU A: ...
|
||||
CPU A: writel(newval, ring_ptr);
|
||||
CPU A: (void)readl(safe_register); /* maybe a config register? */
|
||||
CPU A: spin_unlock_irqrestore(&dev_lock, flags)
|
||||
...
|
||||
CPU B: spin_lock_irqsave(&dev_lock, flags)
|
||||
CPU B: val = readl(my_status);
|
||||
CPU B: ...
|
||||
CPU B: writel(newval2, ring_ptr);
|
||||
CPU B: (void)readl(safe_register); /* maybe a config register? */
|
||||
CPU B: spin_unlock_irqrestore(&dev_lock, flags)
|
||||
...
|
||||
CPU A: spin_lock_irqsave(&dev_lock, flags)
|
||||
CPU A: val = readl(my_status);
|
||||
CPU A: ...
|
||||
CPU A: writel(newval, ring_ptr);
|
||||
CPU A: (void)readl(safe_register); /* maybe a config register? */
|
||||
CPU A: spin_unlock_irqrestore(&dev_lock, flags)
|
||||
...
|
||||
CPU B: spin_lock_irqsave(&dev_lock, flags)
|
||||
CPU B: val = readl(my_status);
|
||||
CPU B: ...
|
||||
CPU B: writel(newval2, ring_ptr);
|
||||
CPU B: (void)readl(safe_register); /* maybe a config register? */
|
||||
CPU B: spin_unlock_irqrestore(&dev_lock, flags)
|
||||
|
||||
Here, the reads from safe_register will cause the I/O chipset to flush any
|
||||
pending writes before actually posting the read to the chipset, preventing
|
||||
|
@ -1,49 +1,50 @@
|
||||
=====================
|
||||
I/O statistics fields
|
||||
---------------
|
||||
=====================
|
||||
|
||||
Since 2.4.20 (and some versions before, with patches), and 2.5.45,
|
||||
more extensive disk statistics have been introduced to help measure disk
|
||||
activity. Tools such as sar and iostat typically interpret these and do
|
||||
activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do
|
||||
the work for you, but in case you are interested in creating your own
|
||||
tools, the fields are explained here.
|
||||
|
||||
In 2.4 now, the information is found as additional fields in
|
||||
/proc/partitions. In 2.6, the same information is found in two
|
||||
places: one is in the file /proc/diskstats, and the other is within
|
||||
``/proc/partitions``. In 2.6 and later, the same information is found in two
|
||||
places: one is in the file ``/proc/diskstats``, and the other is within
|
||||
the sysfs file system, which must be mounted in order to obtain
|
||||
the information. Throughout this document we'll assume that sysfs
|
||||
is mounted on /sys, although of course it may be mounted anywhere.
|
||||
Both /proc/diskstats and sysfs use the same source for the information
|
||||
is mounted on ``/sys``, although of course it may be mounted anywhere.
|
||||
Both ``/proc/diskstats`` and sysfs use the same source for the information
|
||||
and so should not differ.
|
||||
|
||||
Here are examples of these different formats:
|
||||
Here are examples of these different formats::
|
||||
|
||||
2.4:
|
||||
3 0 39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
|
||||
3 1 9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030
|
||||
2.4:
|
||||
3 0 39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
|
||||
3 1 9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030
|
||||
|
||||
2.6+ sysfs:
|
||||
446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
|
||||
35486 38030 38030 38030
|
||||
|
||||
2.6 sysfs:
|
||||
446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
|
||||
35486 38030 38030 38030
|
||||
2.6+ diskstats:
|
||||
3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
|
||||
3 1 hda1 35486 38030 38030 38030
|
||||
|
||||
2.6 diskstats:
|
||||
3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
|
||||
3 1 hda1 35486 38030 38030 38030
|
||||
On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
|
||||
a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
|
||||
|
||||
On 2.4 you might execute "grep 'hda ' /proc/partitions". On 2.6, you have
|
||||
a choice of "cat /sys/block/hda/stat" or "grep 'hda ' /proc/diskstats".
|
||||
The advantage of one over the other is that the sysfs choice works well
|
||||
if you are watching a known, small set of disks. /proc/diskstats may
|
||||
if you are watching a known, small set of disks. ``/proc/diskstats`` may
|
||||
be a better choice if you are watching a large number of disks because
|
||||
you'll avoid the overhead of 50, 100, or 500 or more opens/closes with
|
||||
each snapshot of your disk statistics.
|
||||
|
||||
In 2.4, the statistics fields are those after the device name. In
|
||||
the above example, the first field of statistics would be 446216.
|
||||
By contrast, in 2.6 if you look at /sys/block/hda/stat, you'll
|
||||
By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll
|
||||
find just the eleven fields, beginning with 446216. If you look at
|
||||
/proc/diskstats, the eleven fields will be preceded by the major and
|
||||
``/proc/diskstats``, the eleven fields will be preceded by the major and
|
||||
minor device numbers, and device name. Each of these formats provides
|
||||
eleven fields of statistics, each meaning exactly the same things.
|
||||
All fields except field 9 are cumulative since boot. Field 9 should
|
||||
@ -59,30 +60,40 @@ system-wide stats you'll have to find all the devices and sum them all up.
|
||||
|
||||
Field 1 -- # of reads completed
|
||||
This is the total number of reads completed successfully.
|
||||
|
||||
Field 2 -- # of reads merged, field 6 -- # of writes merged
|
||||
Reads and writes which are adjacent to each other may be merged for
|
||||
efficiency. Thus two 4K reads may become one 8K read before it is
|
||||
ultimately handed to the disk, and so it will be counted (and queued)
|
||||
as only one I/O. This field lets you know how often this was done.
|
||||
|
||||
Field 3 -- # of sectors read
|
||||
This is the total number of sectors read successfully.
|
||||
|
||||
Field 4 -- # of milliseconds spent reading
|
||||
This is the total number of milliseconds spent by all reads (as
|
||||
measured from __make_request() to end_that_request_last()).
|
||||
|
||||
Field 5 -- # of writes completed
|
||||
This is the total number of writes completed successfully.
|
||||
|
||||
Field 6 -- # of writes merged
|
||||
See the description of field 2.
|
||||
|
||||
Field 7 -- # of sectors written
|
||||
This is the total number of sectors written successfully.
|
||||
|
||||
Field 8 -- # of milliseconds spent writing
|
||||
This is the total number of milliseconds spent by all writes (as
|
||||
measured from __make_request() to end_that_request_last()).
|
||||
|
||||
Field 9 -- # of I/Os currently in progress
|
||||
The only field that should go to zero. Incremented as requests are
|
||||
given to appropriate struct request_queue and decremented as they finish.
|
||||
|
||||
Field 10 -- # of milliseconds spent doing I/Os
|
||||
This field increases so long as field 9 is nonzero.
|
||||
|
||||
Field 11 -- weighted # of milliseconds spent doing I/Os
|
||||
This field is incremented at each I/O start, I/O completion, I/O
|
||||
merge, or read of these stats by the number of I/Os in progress
|
||||
@ -97,7 +108,7 @@ introduced when changes collide, so (for instance) adding up all the
|
||||
read I/Os issued per partition should equal those made to the disks ...
|
||||
but due to the lack of locking it may only be very close.
|
||||
|
||||
In 2.6, there are counters for each CPU, which make the lack of locking
|
||||
In 2.6+, there are counters for each CPU, which make the lack of locking
|
||||
almost a non-issue. When the statistics are read, the per-CPU counters
|
||||
are summed (possibly overflowing the unsigned long variable they are
|
||||
summed to) and the result given to the user. There is no convenient
|
||||
@ -106,22 +117,25 @@ user interface for accessing the per-CPU counters themselves.
|
||||
Disks vs Partitions
|
||||
-------------------
|
||||
|
||||
There were significant changes between 2.4 and 2.6 in the I/O subsystem.
|
||||
There were significant changes between 2.4 and 2.6+ in the I/O subsystem.
|
||||
As a result, some statistic information disappeared. The translation from
|
||||
a disk address relative to a partition to the disk address relative to
|
||||
the host disk happens much earlier. All merges and timings now happen
|
||||
at the disk level rather than at both the disk and partition level as
|
||||
in 2.4. Consequently, you'll see a different statistics output on 2.6 for
|
||||
in 2.4. Consequently, you'll see a different statistics output on 2.6+ for
|
||||
partitions from that for disks. There are only *four* fields available
|
||||
for partitions on 2.6 machines. This is reflected in the examples above.
|
||||
for partitions on 2.6+ machines. This is reflected in the examples above.
|
||||
|
||||
Field 1 -- # of reads issued
|
||||
This is the total number of reads issued to this partition.
|
||||
|
||||
Field 2 -- # of sectors read
|
||||
This is the total number of sectors requested to be read from this
|
||||
partition.
|
||||
|
||||
Field 3 -- # of writes issued
|
||||
This is the total number of writes issued to this partition.
|
||||
|
||||
Field 4 -- # of sectors written
|
||||
This is the total number of sectors requested to be written to
|
||||
this partition.
|
||||
@ -149,16 +163,16 @@ to some (probably insignificant) inaccuracy.
|
||||
Additional notes
|
||||
----------------
|
||||
|
||||
In 2.6, sysfs is not mounted by default. If your distribution of
|
||||
In 2.6+, sysfs is not mounted by default. If your distribution of
|
||||
Linux hasn't added it already, here's the line you'll want to add to
|
||||
your /etc/fstab:
|
||||
your ``/etc/fstab``::
|
||||
|
||||
none /sys sysfs defaults 0 0
|
||||
none /sys sysfs defaults 0 0
|
||||
|
||||
|
||||
In 2.6, all disk statistics were removed from /proc/stat. In 2.4, they
|
||||
appear in both /proc/partitions and /proc/stat, although the ones in
|
||||
/proc/stat take a very different format from those in /proc/partitions
|
||||
In 2.6+, all disk statistics were removed from ``/proc/stat``. In 2.4, they
|
||||
appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in
|
||||
``/proc/stat`` take a very different format from those in ``/proc/partitions``
|
||||
(see proc(5), if your system has it.)
|
||||
|
||||
-- ricklind@us.ibm.com
|
||||
|
@ -1,8 +1,10 @@
|
||||
=======================
|
||||
IRQ-flags state tracing
|
||||
=======================
|
||||
|
||||
started by Ingo Molnar <mingo@redhat.com>
|
||||
:Author: started by Ingo Molnar <mingo@redhat.com>
|
||||
|
||||
the "irq-flags tracing" feature "traces" hardirq and softirq state, in
|
||||
The "irq-flags tracing" feature "traces" hardirq and softirq state, in
|
||||
that it gives interested subsystems an opportunity to be notified of
|
||||
every hardirqs-off/hardirqs-on, softirqs-off/softirqs-on event that
|
||||
happens in the kernel.
|
||||
@ -14,7 +16,7 @@ CONFIG_PROVE_RWSEM_LOCKING will be offered on an architecture - these
|
||||
are locking APIs that are not used in IRQ context. (the one exception
|
||||
for rwsems is worked around)
|
||||
|
||||
architecture support for this is certainly not in the "trivial"
|
||||
Architecture support for this is certainly not in the "trivial"
|
||||
category, because lots of low-level assembly code deals with irq-flags
|
||||
state changes. But an architecture can be irq-flags-tracing enabled in a
|
||||
rather straightforward and risk-free manner.
|
||||
@ -41,7 +43,7 @@ irq-flags-tracing support:
|
||||
excluded from the irq-tracing [and lock validation] mechanism via
|
||||
lockdep_off()/lockdep_on().
|
||||
|
||||
in general there is no risk from having an incomplete irq-flags-tracing
|
||||
In general there is no risk from having an incomplete irq-flags-tracing
|
||||
implementation in an architecture: lockdep will detect that and will
|
||||
turn itself off. I.e. the lock validator will still be reliable. There
|
||||
should be no crashes due to irq-tracing bugs. (except if the assembly
|
||||
|
@ -1,5 +1,6 @@
|
||||
===========
|
||||
ISA Drivers
|
||||
-----------
|
||||
===========
|
||||
|
||||
The following text is adapted from the commit message of the initial
|
||||
commit of the ISA bus driver authored by Rene Herman.
|
||||
@ -23,17 +24,17 @@ that all device creation has been made internal as well.
|
||||
|
||||
The usage model this provides is nice, and has been acked from the ALSA
|
||||
side by Takashi Iwai and Jaroslav Kysela. The ALSA driver module_init's
|
||||
now (for oldisa-only drivers) become:
|
||||
now (for oldisa-only drivers) become::
|
||||
|
||||
static int __init alsa_card_foo_init(void)
|
||||
{
|
||||
return isa_register_driver(&snd_foo_isa_driver, SNDRV_CARDS);
|
||||
}
|
||||
static int __init alsa_card_foo_init(void)
|
||||
{
|
||||
return isa_register_driver(&snd_foo_isa_driver, SNDRV_CARDS);
|
||||
}
|
||||
|
||||
static void __exit alsa_card_foo_exit(void)
|
||||
{
|
||||
isa_unregister_driver(&snd_foo_isa_driver);
|
||||
}
|
||||
static void __exit alsa_card_foo_exit(void)
|
||||
{
|
||||
isa_unregister_driver(&snd_foo_isa_driver);
|
||||
}
|
||||
|
||||
Quite like the other bus models therefore. This removes a lot of
|
||||
duplicated init code from the ALSA ISA drivers.
|
||||
@ -47,11 +48,11 @@ parameter, indicating how many devices to create and call our methods
|
||||
with.
|
||||
|
||||
The platform_driver callbacks are called with a platform_device param;
|
||||
the isa_driver callbacks are being called with a "struct device *dev,
|
||||
unsigned int id" pair directly -- with the device creation completely
|
||||
the isa_driver callbacks are being called with a ``struct device *dev,
|
||||
unsigned int id`` pair directly -- with the device creation completely
|
||||
internal to the bus it's much cleaner to not leak isa_dev's by passing
|
||||
them in at all. The id is the only thing we ever want other than the
|
||||
struct device * anyways, and it makes for nicer code in the callbacks as
|
||||
struct device anyways, and it makes for nicer code in the callbacks as
|
||||
well.
|
||||
|
||||
With this additional .match() callback ISA drivers have all options. If
|
||||
@ -75,20 +76,20 @@ This exports only two functions; isa_{,un}register_driver().
|
||||
|
||||
isa_register_driver() registers the struct device_driver, and then
|
||||
loops over the passed in ndev creating devices and registering them.
|
||||
This causes the bus match method to be called for them, which is:
|
||||
This causes the bus match method to be called for them, which is::
|
||||
|
||||
int isa_bus_match(struct device *dev, struct device_driver *driver)
|
||||
{
|
||||
struct isa_driver *isa_driver = to_isa_driver(driver);
|
||||
int isa_bus_match(struct device *dev, struct device_driver *driver)
|
||||
{
|
||||
struct isa_driver *isa_driver = to_isa_driver(driver);
|
||||
|
||||
if (dev->platform_data == isa_driver) {
|
||||
if (!isa_driver->match ||
|
||||
isa_driver->match(dev, to_isa_dev(dev)->id))
|
||||
return 1;
|
||||
dev->platform_data = NULL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
if (dev->platform_data == isa_driver) {
|
||||
if (!isa_driver->match ||
|
||||
isa_driver->match(dev, to_isa_dev(dev)->id))
|
||||
return 1;
|
||||
dev->platform_data = NULL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
The first thing this does is check if this device is in fact one of this
|
||||
driver's devices by seeing if the device's platform_data pointer is set
|
||||
@ -102,7 +103,7 @@ well.
|
||||
Then, if the driver did not provide a .match, it matches. If it did,
|
||||
the driver match() method is called to determine a match.
|
||||
|
||||
If it did _not_ match, dev->platform_data is reset to indicate this to
|
||||
If it did **not** match, dev->platform_data is reset to indicate this to
|
||||
isa_register_driver which can then unregister the device again.
|
||||
|
||||
If during all this, there's any error, or no devices matched at all
|
||||
|
@ -1,3 +1,4 @@
|
||||
==========================================================
|
||||
ISA Plug & Play support by Jaroslav Kysela <perex@suse.cz>
|
||||
==========================================================
|
||||
|
||||
|
@ -112,8 +112,8 @@ There are two possible methods of using Kdump.
|
||||
2) Or use the system kernel binary itself as dump-capture kernel and there is
|
||||
no need to build a separate dump-capture kernel. This is possible
|
||||
only with the architectures which support a relocatable kernel. As
|
||||
of today, i386, x86_64, ppc64, ia64 and arm architectures support relocatable
|
||||
kernel.
|
||||
of today, i386, x86_64, ppc64, ia64, arm and arm64 architectures support
|
||||
relocatable kernel.
|
||||
|
||||
Building a relocatable kernel is advantageous from the point of view that
|
||||
one does not have to build a second kernel for capturing the dump. But
|
||||
@ -339,7 +339,7 @@ For arm:
|
||||
For arm64:
|
||||
- Use vmlinux or Image
|
||||
|
||||
If you are using a uncompressed vmlinux image then use following command
|
||||
If you are using an uncompressed vmlinux image then use the following command
|
||||
to load dump-capture kernel.
|
||||
|
||||
kexec -p <dump-capture-kernel-vmlinux-image> \
|
||||
@ -361,6 +361,12 @@ to load dump-capture kernel.
|
||||
--dtb=<dtb-for-dump-capture-kernel> \
|
||||
--append="root=<root-dev> <arch-specific-options>"
|
||||
|
||||
If you are using an uncompressed Image, then use the following command
|
||||
to load dump-capture kernel.
|
||||
|
||||
kexec -p <dump-capture-kernel-Image> \
|
||||
--initrd=<initrd-for-dump-capture-kernel> \
|
||||
--append="root=<root-dev> <arch-specific-options>"
|
||||
|
||||
Please note, that --args-linux does not need to be specified for ia64.
|
||||
It is planned to make this a no-op on that architecture, but for now
|
||||
|
@ -1,27 +1,29 @@
|
||||
REDUCING OS JITTER DUE TO PER-CPU KTHREADS
|
||||
==========================================
|
||||
Reducing OS jitter due to per-cpu kthreads
|
||||
==========================================
|
||||
|
||||
This document lists per-CPU kthreads in the Linux kernel and presents
|
||||
options to control their OS jitter. Note that non-per-CPU kthreads are
|
||||
not listed here. To reduce OS jitter from non-per-CPU kthreads, bind
|
||||
them to a "housekeeping" CPU dedicated to such work.
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
REFERENCES
|
||||
- Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs.
|
||||
|
||||
o Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs.
|
||||
- Documentation/cgroup-v1: Using cgroups to bind tasks to sets of CPUs.
|
||||
|
||||
o Documentation/cgroup-v1: Using cgroups to bind tasks to sets of CPUs.
|
||||
|
||||
o man taskset: Using the taskset command to bind tasks to sets
|
||||
- man taskset: Using the taskset command to bind tasks to sets
|
||||
of CPUs.
|
||||
|
||||
o man sched_setaffinity: Using the sched_setaffinity() system
|
||||
- man sched_setaffinity: Using the sched_setaffinity() system
|
||||
call to bind tasks to sets of CPUs.
|
||||
|
||||
o /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state,
|
||||
- /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state,
|
||||
writing "0" to offline and "1" to online.
|
||||
|
||||
o In order to locate kernel-generated OS jitter on CPU N:
|
||||
- In order to locate kernel-generated OS jitter on CPU N:
|
||||
|
||||
cd /sys/kernel/debug/tracing
|
||||
echo 1 > max_graph_depth # Increase the "1" for more detail
|
||||
@ -29,12 +31,17 @@ o In order to locate kernel-generated OS jitter on CPU N:
|
||||
# run workload
|
||||
cat per_cpu/cpuN/trace
|
||||
|
||||
kthreads
|
||||
========
|
||||
|
||||
KTHREADS
|
||||
Name:
|
||||
ehca_comp/%u
|
||||
|
||||
Purpose:
|
||||
Periodically process Infiniband-related work.
|
||||
|
||||
Name: ehca_comp/%u
|
||||
Purpose: Periodically process Infiniband-related work.
|
||||
To reduce its OS jitter, do any of the following:
|
||||
|
||||
1. Don't use eHCA Infiniband hardware, instead choosing hardware
|
||||
that does not require per-CPU kthreads. This will prevent these
|
||||
kthreads from being created in the first place. (This will
|
||||
@ -46,26 +53,45 @@ To reduce its OS jitter, do any of the following:
|
||||
provisioned only on selected CPUs.
|
||||
|
||||
|
||||
Name: irq/%d-%s
|
||||
Purpose: Handle threaded interrupts.
|
||||
Name:
|
||||
irq/%d-%s
|
||||
|
||||
Purpose:
|
||||
Handle threaded interrupts.
|
||||
|
||||
To reduce its OS jitter, do the following:
|
||||
|
||||
1. Use irq affinity to force the irq threads to execute on
|
||||
some other CPU.
|
||||
|
||||
Name: kcmtpd_ctr_%d
|
||||
Purpose: Handle Bluetooth work.
|
||||
Name:
|
||||
kcmtpd_ctr_%d
|
||||
|
||||
Purpose:
|
||||
Handle Bluetooth work.
|
||||
|
||||
To reduce its OS jitter, do one of the following:
|
||||
|
||||
1. Don't use Bluetooth, in which case these kthreads won't be
|
||||
created in the first place.
|
||||
2. Use irq affinity to force Bluetooth-related interrupts to
|
||||
occur on some other CPU and furthermore initiate all
|
||||
Bluetooth activity on some other CPU.
|
||||
|
||||
Name: ksoftirqd/%u
|
||||
Purpose: Execute softirq handlers when threaded or when under heavy load.
|
||||
Name:
|
||||
ksoftirqd/%u
|
||||
|
||||
Purpose:
|
||||
Execute softirq handlers when threaded or when under heavy load.
|
||||
|
||||
To reduce its OS jitter, each softirq vector must be handled
|
||||
separately as follows:
|
||||
TIMER_SOFTIRQ: Do all of the following:
|
||||
|
||||
TIMER_SOFTIRQ
|
||||
-------------
|
||||
|
||||
Do all of the following:
|
||||
|
||||
1. To the extent possible, keep the CPU out of the kernel when it
|
||||
is non-idle, for example, by avoiding system calls and by forcing
|
||||
both kernel threads and interrupts to execute elsewhere.
|
||||
@ -76,34 +102,59 @@ TIMER_SOFTIRQ: Do all of the following:
|
||||
first one back online. Once you have onlined the CPUs in question,
|
||||
do not offline any other CPUs, because doing so could force the
|
||||
timer back onto one of the CPUs in question.
|
||||
NET_TX_SOFTIRQ and NET_RX_SOFTIRQ: Do all of the following:
|
||||
|
||||
NET_TX_SOFTIRQ and NET_RX_SOFTIRQ
|
||||
---------------------------------
|
||||
|
||||
Do all of the following:
|
||||
|
||||
1. Force networking interrupts onto other CPUs.
|
||||
2. Initiate any network I/O on other CPUs.
|
||||
3. Once your application has started, prevent CPU-hotplug operations
|
||||
from being initiated from tasks that might run on the CPU to
|
||||
be de-jittered. (It is OK to force this CPU offline and then
|
||||
bring it back online before you start your application.)
|
||||
BLOCK_SOFTIRQ: Do all of the following:
|
||||
|
||||
BLOCK_SOFTIRQ
|
||||
-------------
|
||||
|
||||
Do all of the following:
|
||||
|
||||
1. Force block-device interrupts onto some other CPU.
|
||||
2. Initiate any block I/O on other CPUs.
|
||||
3. Once your application has started, prevent CPU-hotplug operations
|
||||
from being initiated from tasks that might run on the CPU to
|
||||
be de-jittered. (It is OK to force this CPU offline and then
|
||||
bring it back online before you start your application.)
|
||||
IRQ_POLL_SOFTIRQ: Do all of the following:
|
||||
|
||||
IRQ_POLL_SOFTIRQ
|
||||
----------------
|
||||
|
||||
Do all of the following:
|
||||
|
||||
1. Force block-device interrupts onto some other CPU.
|
||||
2. Initiate any block I/O and block-I/O polling on other CPUs.
|
||||
3. Once your application has started, prevent CPU-hotplug operations
|
||||
from being initiated from tasks that might run on the CPU to
|
||||
be de-jittered. (It is OK to force this CPU offline and then
|
||||
bring it back online before you start your application.)
|
||||
TASKLET_SOFTIRQ: Do one or more of the following:
|
||||
|
||||
TASKLET_SOFTIRQ
|
||||
---------------
|
||||
|
||||
Do one or more of the following:
|
||||
|
||||
1. Avoid use of drivers that use tasklets. (Such drivers will contain
|
||||
calls to things like tasklet_schedule().)
|
||||
2. Convert all drivers that you must use from tasklets to workqueues.
|
||||
3. Force interrupts for drivers using tasklets onto other CPUs,
|
||||
and also do I/O involving these drivers on other CPUs.
|
||||
SCHED_SOFTIRQ: Do all of the following:
|
||||
|
||||
SCHED_SOFTIRQ
|
||||
-------------
|
||||
|
||||
Do all of the following:
|
||||
|
||||
1. Avoid sending scheduler IPIs to the CPU to be de-jittered,
|
||||
for example, ensure that at most one runnable kthread is present
|
||||
on that CPU. If a thread that expects to run on the de-jittered
|
||||
@ -120,7 +171,12 @@ SCHED_SOFTIRQ: Do all of the following:
|
||||
forcing both kernel threads and interrupts to execute elsewhere.
|
||||
This further reduces the number of scheduler-clock interrupts
|
||||
received by the de-jittered CPU.
|
||||
HRTIMER_SOFTIRQ: Do all of the following:
|
||||
|
||||
HRTIMER_SOFTIRQ
|
||||
---------------
|
||||
|
||||
Do all of the following:
|
||||
|
||||
1. To the extent possible, keep the CPU out of the kernel when it
|
||||
is non-idle. For example, avoid system calls and force both
|
||||
kernel threads and interrupts to execute elsewhere.
|
||||
@ -131,9 +187,15 @@ HRTIMER_SOFTIRQ: Do all of the following:
|
||||
back online. Once you have onlined the CPUs in question, do not
|
||||
offline any other CPUs, because doing so could force the timer
|
||||
back onto one of the CPUs in question.
|
||||
RCU_SOFTIRQ: Do at least one of the following:
|
||||
|
||||
RCU_SOFTIRQ
|
||||
-----------
|
||||
|
||||
Do at least one of the following:
|
||||
|
||||
1. Offload callbacks and keep the CPU in either dyntick-idle or
|
||||
adaptive-ticks state by doing all of the following:
|
||||
|
||||
a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
|
||||
de-jittered is marked as an adaptive-ticks CPU using the
|
||||
"nohz_full=" boot parameter. Bind the rcuo kthreads to
|
||||
@ -142,8 +204,10 @@ RCU_SOFTIRQ: Do at least one of the following:
|
||||
when it is non-idle, for example, by avoiding system
|
||||
calls and by forcing both kernel threads and interrupts
|
||||
to execute elsewhere.
|
||||
|
||||
2. Enable RCU to do its processing remotely via dyntick-idle by
|
||||
doing all of the following:
|
||||
|
||||
a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
|
||||
b. Ensure that the CPU goes idle frequently, allowing other
|
||||
CPUs to detect that it has passed through an RCU quiescent
|
||||
@ -155,15 +219,20 @@ RCU_SOFTIRQ: Do at least one of the following:
|
||||
calls and by forcing both kernel threads and interrupts
|
||||
to execute elsewhere.
|
||||
|
||||
Name: kworker/%u:%d%s (cpu, id, priority)
|
||||
Purpose: Execute workqueue requests
|
||||
Name:
|
||||
kworker/%u:%d%s (cpu, id, priority)
|
||||
|
||||
Purpose:
|
||||
Execute workqueue requests
|
||||
|
||||
To reduce its OS jitter, do any of the following:
|
||||
|
||||
1. Run your workload at a real-time priority, which will allow
|
||||
preempting the kworker daemons.
|
||||
2. A given workqueue can be made visible in the sysfs filesystem
|
||||
by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
|
||||
Such a workqueue can be confined to a given subset of the
|
||||
CPUs using the /sys/devices/virtual/workqueue/*/cpumask sysfs
|
||||
CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
|
||||
files. The set of WQ_SYSFS workqueues can be displayed using
|
||||
"ls /sys/devices/virtual/workqueue". That said, the workqueues
|
||||
maintainer would like to caution people against indiscriminately
|
||||
@ -173,6 +242,7 @@ To reduce its OS jitter, do any of the following:
|
||||
to remove it, even if its addition was a mistake.
|
||||
3. Do any of the following needed to avoid jitter that your
|
||||
application cannot tolerate:
|
||||
|
||||
a. Build your kernel with CONFIG_SLUB=y rather than
|
||||
CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
|
||||
use of each CPU's workqueues to run its cache_reap()
|
||||
@ -186,6 +256,7 @@ To reduce its OS jitter, do any of the following:
|
||||
be able to build your kernel with CONFIG_CPU_FREQ=n to
|
||||
avoid the CPU-frequency governor periodically running
|
||||
on each CPU, including cs_dbs_timer() and od_dbs_timer().
|
||||
|
||||
WARNING: Please check your CPU specifications to
|
||||
make sure that this is safe on your particular system.
|
||||
d. As of v3.18, Christoph Lameter's on-demand vmstat workers
|
||||
@ -222,9 +293,14 @@ To reduce its OS jitter, do any of the following:
|
||||
CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
|
||||
avoiding OS jitter from rackmeter_do_timer().
|
||||
|
||||
Name: rcuc/%u
|
||||
Purpose: Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
|
||||
Name:
|
||||
rcuc/%u
|
||||
|
||||
Purpose:
|
||||
Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
|
||||
|
||||
To reduce its OS jitter, do at least one of the following:
|
||||
|
||||
1. Build the kernel with CONFIG_PREEMPT=n. This prevents these
|
||||
kthreads from being created in the first place, and also obviates
|
||||
the need for RCU priority boosting. This approach is feasible
|
||||
@ -244,9 +320,14 @@ To reduce its OS jitter, do at least one of the following:
|
||||
CPU, again preventing the rcuc/%u kthreads from having any work
|
||||
to do.
|
||||
|
||||
Name: rcuob/%d, rcuop/%d, and rcuos/%d
|
||||
Purpose: Offload RCU callbacks from the corresponding CPU.
|
||||
Name:
|
||||
rcuob/%d, rcuop/%d, and rcuos/%d
|
||||
|
||||
Purpose:
|
||||
Offload RCU callbacks from the corresponding CPU.
|
||||
|
||||
To reduce its OS jitter, do at least one of the following:
|
||||
|
||||
1. Use affinity, cgroups, or other mechanism to force these kthreads
|
||||
to execute on some other CPU.
|
||||
2. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these
|
||||
@ -254,9 +335,14 @@ To reduce its OS jitter, do at least one of the following:
|
||||
note that this will not eliminate OS jitter, but will instead
|
||||
shift it to RCU_SOFTIRQ.
|
||||
|
||||
Name: watchdog/%u
|
||||
Purpose: Detect software lockups on each CPU.
|
||||
Name:
|
||||
watchdog/%u
|
||||
|
||||
Purpose:
|
||||
Detect software lockups on each CPU.
|
||||
|
||||
To reduce its OS jitter, do at least one of the following:
|
||||
|
||||
1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
|
||||
kthreads from being created in the first place.
|
||||
2. Boot with "nosoftlockup=0", which will also prevent these kthreads
|
||||
|
@ -1,13 +1,13 @@
|
||||
=====================================================================
|
||||
Everything you never wanted to know about kobjects, ksets, and ktypes
|
||||
=====================================================================
|
||||
|
||||
Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
||||
:Author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
||||
:Last updated: December 19, 2007
|
||||
|
||||
Based on an original article by Jon Corbet for lwn.net written October 1,
|
||||
2003 and located at http://lwn.net/Articles/51437/
|
||||
|
||||
Last updated December 19, 2007
|
||||
|
||||
|
||||
Part of the difficulty in understanding the driver model - and the kobject
|
||||
abstraction upon which it is built - is that there is no obvious starting
|
||||
place. Dealing with kobjects requires understanding a few different types,
|
||||
@ -47,6 +47,7 @@ approach will be taken, so we'll go back to kobjects.
|
||||
|
||||
|
||||
Embedding kobjects
|
||||
==================
|
||||
|
||||
It is rare for kernel code to create a standalone kobject, with one major
|
||||
exception explained below. Instead, kobjects are used to control access to
|
||||
@ -65,7 +66,7 @@ their own, but are invariably found embedded in the larger objects of
|
||||
interest.)
|
||||
|
||||
So, for example, the UIO code in drivers/uio/uio.c has a structure that
|
||||
defines the memory region associated with a uio device:
|
||||
defines the memory region associated with a uio device::
|
||||
|
||||
struct uio_map {
|
||||
struct kobject kobj;
|
||||
@ -77,7 +78,7 @@ just a matter of using the kobj member. Code that works with kobjects will
|
||||
often have the opposite problem, however: given a struct kobject pointer,
|
||||
what is the pointer to the containing structure? You must avoid tricks
|
||||
(such as assuming that the kobject is at the beginning of the structure)
|
||||
and, instead, use the container_of() macro, found in <linux/kernel.h>:
|
||||
and, instead, use the container_of() macro, found in <linux/kernel.h>::
|
||||
|
||||
container_of(pointer, type, member)
|
||||
|
||||
@ -90,13 +91,13 @@ where:
|
||||
The return value from container_of() is a pointer to the corresponding
|
||||
container type. So, for example, a pointer "kp" to a struct kobject
|
||||
embedded *within* a struct uio_map could be converted to a pointer to the
|
||||
*containing* uio_map structure with:
|
||||
*containing* uio_map structure with::
|
||||
|
||||
struct uio_map *u_map = container_of(kp, struct uio_map, kobj);
|
||||
|
||||
For convenience, programmers often define a simple macro for "back-casting"
|
||||
kobject pointers to the containing type. Exactly this happens in the
|
||||
earlier drivers/uio/uio.c, as you can see here:
|
||||
earlier drivers/uio/uio.c, as you can see here::
|
||||
|
||||
struct uio_map {
|
||||
struct kobject kobj;
|
||||
@ -106,23 +107,25 @@ earlier drivers/uio/uio.c, as you can see here:
|
||||
#define to_map(map) container_of(map, struct uio_map, kobj)
|
||||
|
||||
where the macro argument "map" is a pointer to the struct kobject in
|
||||
question. That macro is subsequently invoked with:
|
||||
question. That macro is subsequently invoked with::
|
||||
|
||||
struct uio_map *map = to_map(kobj);
|
||||
|
||||
|
||||
Initialization of kobjects
|
||||
==========================
|
||||
|
||||
Code which creates a kobject must, of course, initialize that object. Some
|
||||
of the internal fields are setup with a (mandatory) call to kobject_init():
|
||||
of the internal fields are set up with a (mandatory) call to kobject_init()::
|
||||
|
||||
void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
|
||||
|
||||
The ktype is required for a kobject to be created properly, as every kobject
|
||||
must have an associated kobj_type. After calling kobject_init(), to
|
||||
register the kobject with sysfs, the function kobject_add() must be called:
|
||||
register the kobject with sysfs, the function kobject_add() must be called::
|
||||
|
||||
int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...);
|
||||
int kobject_add(struct kobject *kobj, struct kobject *parent,
|
||||
const char *fmt, ...);
|
||||
|
||||
This sets up the parent of the kobject and the name for the kobject
|
||||
properly. If the kobject is to be associated with a specific kset,
|
||||
@ -133,7 +136,7 @@ kset itself.
|
||||
|
||||
As the name of the kobject is set when it is added to the kernel, the name
|
||||
of the kobject should never be manipulated directly. If you must change
|
||||
the name of the kobject, call kobject_rename():
|
||||
the name of the kobject, call kobject_rename()::
|
||||
|
||||
int kobject_rename(struct kobject *kobj, const char *new_name);
|
||||
|
||||
@ -146,12 +149,12 @@ is being removed. If your code needs to call this function, it is
|
||||
incorrect and needs to be fixed.
|
||||
|
||||
To properly access the name of the kobject, use the function
|
||||
kobject_name():
|
||||
kobject_name()::
|
||||
|
||||
const char *kobject_name(const struct kobject * kobj);
|
||||
|
||||
There is a helper function to both initialize and add the kobject to the
|
||||
kernel at the same time, called surprisingly enough kobject_init_and_add():
|
||||
kernel at the same time, called surprisingly enough kobject_init_and_add()::
|
||||
|
||||
int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
|
||||
struct kobject *parent, const char *fmt, ...);
|
||||
@ -161,10 +164,11 @@ kobject_add() functions described above.
|
||||
|
||||
|
||||
Uevents
|
||||
=======
|
||||
|
||||
After a kobject has been registered with the kobject core, you need to
|
||||
announce to the world that it has been created. This can be done with a
|
||||
call to kobject_uevent():
|
||||
call to kobject_uevent()::
|
||||
|
||||
int kobject_uevent(struct kobject *kobj, enum kobject_action action);
|
||||
|
||||
@ -180,11 +184,12 @@ hand.
|
||||
|
||||
|
||||
Reference counts
|
||||
================
|
||||
|
||||
One of the key functions of a kobject is to serve as a reference counter
|
||||
for the object in which it is embedded. As long as references to the object
|
||||
exist, the object (and the code which supports it) must continue to exist.
|
||||
The low-level functions for manipulating a kobject's reference counts are:
|
||||
The low-level functions for manipulating a kobject's reference counts are::
|
||||
|
||||
struct kobject *kobject_get(struct kobject *kobj);
|
||||
void kobject_put(struct kobject *kobj);
|
||||
@ -209,21 +214,24 @@ file Documentation/kref.txt in the Linux kernel source tree.
|
||||
|
||||
|
||||
Creating "simple" kobjects
|
||||
==========================
|
||||
|
||||
Sometimes all that a developer wants is a way to create a simple directory
|
||||
in the sysfs hierarchy, and not have to mess with the whole complication of
|
||||
ksets, show and store functions, and other details. This is the one
|
||||
exception where a single kobject should be created. To create such an
|
||||
entry, use the function:
|
||||
entry, use the function::
|
||||
|
||||
struct kobject *kobject_create_and_add(char *name, struct kobject *parent);
|
||||
|
||||
This function will create a kobject and place it in sysfs in the location
|
||||
underneath the specified parent kobject. To create simple attributes
|
||||
associated with this kobject, use:
|
||||
associated with this kobject, use::
|
||||
|
||||
int sysfs_create_file(struct kobject *kobj, struct attribute *attr);
|
||||
or
|
||||
|
||||
or::
|
||||
|
||||
int sysfs_create_group(struct kobject *kobj, struct attribute_group *grp);
|
||||
|
||||
Both types of attributes used here, with a kobject that has been created
|
||||
@ -236,6 +244,7 @@ implementation of a simple kobject and attributes.
|
||||
|
||||
|
||||
ktypes and release methods
|
||||
==========================
|
||||
|
||||
One important thing still missing from the discussion is what happens to a
|
||||
kobject when its reference count reaches zero. The code which created the
|
||||
@ -257,7 +266,7 @@ is good practice to always use kobject_put() after kobject_init() to avoid
|
||||
errors creeping in.
|
||||
|
||||
This notification is done through a kobject's release() method. Usually
|
||||
such a method has a form like:
|
||||
such a method has a form like::
|
||||
|
||||
void my_object_release(struct kobject *kobj)
|
||||
{
|
||||
@ -281,7 +290,7 @@ leak in the kobject core, which makes people unhappy.
|
||||
|
||||
Interestingly, the release() method is not stored in the kobject itself;
|
||||
instead, it is associated with the ktype. So let us introduce struct
|
||||
kobj_type:
|
||||
kobj_type::
|
||||
|
||||
struct kobj_type {
|
||||
void (*release)(struct kobject *kobj);
|
||||
@ -306,6 +315,7 @@ automatically created for any kobject that is registered with this ktype.
|
||||
|
||||
|
||||
ksets
|
||||
=====
|
||||
|
||||
A kset is merely a collection of kobjects that want to be associated with
|
||||
each other. There is no restriction that they be of the same ktype, but be
|
||||
@ -335,13 +345,16 @@ kobject) in their parent.
|
||||
|
||||
As a kset contains a kobject within it, it should always be dynamically
|
||||
created and never declared statically or on the stack. To create a new
|
||||
kset use:
|
||||
kset use::
|
||||
|
||||
struct kset *kset_create_and_add(const char *name,
|
||||
struct kset_uevent_ops *u,
|
||||
struct kobject *parent);
|
||||
|
||||
When you are finished with the kset, call:
|
||||
When you are finished with the kset, call::
|
||||
|
||||
void kset_unregister(struct kset *kset);
|
||||
|
||||
to destroy it. This removes the kset from sysfs and decrements its reference
|
||||
count. When the reference count goes to zero, the kset will be released.
|
||||
Because other references to the kset may still exist, the release may happen
|
||||
@ -351,14 +364,14 @@ An example of using a kset can be seen in the
|
||||
samples/kobject/kset-example.c file in the kernel tree.
|
||||
|
||||
If a kset wishes to control the uevent operations of the kobjects
|
||||
associated with it, it can use the struct kset_uevent_ops to handle it:
|
||||
associated with it, it can use the struct kset_uevent_ops to handle it::
|
||||
|
||||
struct kset_uevent_ops {
|
||||
struct kset_uevent_ops {
|
||||
int (*filter)(struct kset *kset, struct kobject *kobj);
|
||||
const char *(*name)(struct kset *kset, struct kobject *kobj);
|
||||
int (*uevent)(struct kset *kset, struct kobject *kobj,
|
||||
struct kobj_uevent_env *env);
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
The filter function allows a kset to prevent a uevent from being emitted to
|
||||
@ -386,6 +399,7 @@ added below the parent kobject.
|
||||
|
||||
|
||||
Kobject removal
|
||||
===============
|
||||
|
||||
After a kobject has been registered with the kobject core successfully, it
|
||||
must be cleaned up when the code is finished with it. To do that, call
|
||||
@ -409,6 +423,7 @@ called, and the objects in the former circle release each other.
|
||||
|
||||
|
||||
Example code to copy from
|
||||
=========================
|
||||
|
||||
For a more complete example of using ksets and kobjects properly, see the
|
||||
example programs samples/kobject/{kobject-example.c,kset-example.c},
|
||||
|
@ -1,30 +1,36 @@
|
||||
Title : Kernel Probes (Kprobes)
|
||||
Authors : Jim Keniston <jkenisto@us.ibm.com>
|
||||
: Prasanna S Panchamukhi <prasanna.panchamukhi@gmail.com>
|
||||
: Masami Hiramatsu <mhiramat@redhat.com>
|
||||
=======================
|
||||
Kernel Probes (Kprobes)
|
||||
=======================
|
||||
|
||||
CONTENTS
|
||||
:Author: Jim Keniston <jkenisto@us.ibm.com>
|
||||
:Author: Prasanna S Panchamukhi <prasanna.panchamukhi@gmail.com>
|
||||
:Author: Masami Hiramatsu <mhiramat@redhat.com>
|
||||
|
||||
1. Concepts: Kprobes, Jprobes, Return Probes
|
||||
2. Architectures Supported
|
||||
3. Configuring Kprobes
|
||||
4. API Reference
|
||||
5. Kprobes Features and Limitations
|
||||
6. Probe Overhead
|
||||
7. TODO
|
||||
8. Kprobes Example
|
||||
9. Jprobes Example
|
||||
10. Kretprobes Example
|
||||
Appendix A: The kprobes debugfs interface
|
||||
Appendix B: The kprobes sysctl interface
|
||||
.. CONTENTS
|
||||
|
||||
1. Concepts: Kprobes, Jprobes, Return Probes
|
||||
1. Concepts: Kprobes, Jprobes, Return Probes
|
||||
2. Architectures Supported
|
||||
3. Configuring Kprobes
|
||||
4. API Reference
|
||||
5. Kprobes Features and Limitations
|
||||
6. Probe Overhead
|
||||
7. TODO
|
||||
8. Kprobes Example
|
||||
9. Jprobes Example
|
||||
10. Kretprobes Example
|
||||
Appendix A: The kprobes debugfs interface
|
||||
Appendix B: The kprobes sysctl interface
|
||||
|
||||
Concepts: Kprobes, Jprobes, Return Probes
|
||||
=========================================
|
||||
|
||||
Kprobes enables you to dynamically break into any kernel routine and
|
||||
collect debugging and performance information non-disruptively. You
|
||||
can trap at almost any kernel code address(*), specifying a handler
|
||||
can trap at almost any kernel code address [1]_, specifying a handler
|
||||
routine to be invoked when the breakpoint is hit.
|
||||
(*: some parts of the kernel code can not be trapped, see 1.5 Blacklist)
|
||||
|
||||
.. [1] some parts of the kernel code cannot be trapped, see
|
||||
:ref:`kprobes_blacklist`
|
||||
|
||||
There are currently three types of probes: kprobes, jprobes, and
|
||||
kretprobes (also called return probes). A kprobe can be inserted
|
||||
@ -40,8 +46,8 @@ registration function such as register_kprobe() specifies where
|
||||
the probe is to be inserted and what handler is to be called when
|
||||
the probe is hit.
|
||||
|
||||
There are also register_/unregister_*probes() functions for batch
|
||||
registration/unregistration of a group of *probes. These functions
|
||||
There are also ``register_/unregister_*probes()`` functions for batch
|
||||
registration/unregistration of a group of ``*probes``. These functions
|
||||
can speed up the unregistration process when you have to unregister
|
||||
a lot of probes at once.
|
||||
|
||||
@ -51,9 +57,10 @@ things that you'll need to know in order to make the best use of
|
||||
Kprobes -- e.g., the difference between a pre_handler and
|
||||
a post_handler, and how to use the maxactive and nmissed fields of
|
||||
a kretprobe. But if you're in a hurry to start using Kprobes, you
|
||||
can skip ahead to section 2.
|
||||
can skip ahead to :ref:`kprobes_archs_supported`.
|
||||
|
||||
1.1 How Does a Kprobe Work?
|
||||
How Does a Kprobe Work?
|
||||
-----------------------
|
||||
|
||||
When a kprobe is registered, Kprobes makes a copy of the probed
|
||||
instruction and replaces the first byte(s) of the probed instruction
|
||||
@ -75,7 +82,8 @@ After the instruction is single-stepped, Kprobes executes the
|
||||
"post_handler," if any, that is associated with the kprobe.
|
||||
Execution then continues with the instruction following the probepoint.
|
||||
|
||||
1.2 How Does a Jprobe Work?
|
||||
How Does a Jprobe Work?
|
||||
-----------------------
|
||||
|
||||
A jprobe is implemented using a kprobe that is placed on a function's
|
||||
entry point. It employs a simple mirroring principle to allow
|
||||
@ -113,9 +121,11 @@ more than eight function arguments, an argument of more than sixteen
|
||||
bytes, or more than 64 bytes of argument data, depending on
|
||||
architecture).
|
||||
|
||||
1.3 Return Probes
|
||||
Return Probes
|
||||
-------------
|
||||
|
||||
1.3.1 How Does a Return Probe Work?
|
||||
How Does a Return Probe Work?
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
When you call register_kretprobe(), Kprobes establishes a kprobe at
|
||||
the entry to the function. When the probed function is called and this
|
||||
@ -150,7 +160,8 @@ zero when the return probe is registered, and is incremented every
|
||||
time the probed function is entered but there is no kretprobe_instance
|
||||
object available for establishing the return probe.
|
||||
|
||||
1.3.2 Kretprobe entry-handler
|
||||
Kretprobe entry-handler
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Kretprobes also provides an optional user-specified handler which runs
|
||||
on function entry. This handler is specified by setting the entry_handler
|
||||
@ -174,7 +185,10 @@ In case probed function is entered but there is no kretprobe_instance
|
||||
object available, then in addition to incrementing the nmissed count,
|
||||
the user entry_handler invocation is also skipped.
|
||||
|
||||
1.4 How Does Jump Optimization Work?
|
||||
.. _kprobes_jump_optimization:
|
||||
|
||||
How Does Jump Optimization Work?
|
||||
--------------------------------
|
||||
|
||||
If your kernel is built with CONFIG_OPTPROBES=y (currently this flag
|
||||
is automatically set 'y' on x86/x86-64, non-preemptive kernel) and
|
||||
@ -182,53 +196,60 @@ the "debug.kprobes_optimization" kernel parameter is set to 1 (see
|
||||
sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump
|
||||
instruction instead of a breakpoint instruction at each probepoint.
|
||||
|
||||
1.4.1 Init a Kprobe
|
||||
Init a Kprobe
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
When a probe is registered, before attempting this optimization,
|
||||
Kprobes inserts an ordinary, breakpoint-based kprobe at the specified
|
||||
address. So, even if it's not possible to optimize this particular
|
||||
probepoint, there'll be a probe there.
|
||||
|
||||
1.4.2 Safety Check
|
||||
Safety Check
|
||||
^^^^^^^^^^^^
|
||||
|
||||
Before optimizing a probe, Kprobes performs the following safety checks:
|
||||
|
||||
- Kprobes verifies that the region that will be replaced by the jump
|
||||
instruction (the "optimized region") lies entirely within one function.
|
||||
(A jump instruction is multiple bytes, and so may overlay multiple
|
||||
instructions.)
|
||||
instruction (the "optimized region") lies entirely within one function.
|
||||
(A jump instruction is multiple bytes, and so may overlay multiple
|
||||
instructions.)
|
||||
|
||||
- Kprobes analyzes the entire function and verifies that there is no
|
||||
jump into the optimized region. Specifically:
|
||||
jump into the optimized region. Specifically:
|
||||
|
||||
- the function contains no indirect jump;
|
||||
- the function contains no instruction that causes an exception (since
|
||||
the fixup code triggered by the exception could jump back into the
|
||||
optimized region -- Kprobes checks the exception tables to verify this);
|
||||
and
|
||||
the fixup code triggered by the exception could jump back into the
|
||||
optimized region -- Kprobes checks the exception tables to verify this);
|
||||
- there is no near jump to the optimized region (other than to the first
|
||||
byte).
|
||||
byte).
|
||||
|
||||
- For each instruction in the optimized region, Kprobes verifies that
|
||||
the instruction can be executed out of line.
|
||||
the instruction can be executed out of line.
|
||||
|
||||
1.4.3 Preparing Detour Buffer
|
||||
Preparing Detour Buffer
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Next, Kprobes prepares a "detour" buffer, which contains the following
|
||||
instruction sequence:
|
||||
|
||||
- code to push the CPU's registers (emulating a breakpoint trap)
|
||||
- a call to the trampoline code which calls user's probe handlers.
|
||||
- code to restore registers
|
||||
- the instructions from the optimized region
|
||||
- a jump back to the original execution path.
|
||||
|
||||
1.4.4 Pre-optimization
|
||||
Pre-optimization
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
After preparing the detour buffer, Kprobes verifies that none of the
|
||||
following situations exist:
|
||||
|
||||
- The probe has either a break_handler (i.e., it's a jprobe) or a
|
||||
post_handler.
|
||||
post_handler.
|
||||
- Other instructions in the optimized region are probed.
|
||||
- The probe is disabled.
|
||||
|
||||
In any of the above cases, Kprobes won't start optimizing the probe.
|
||||
Since these are temporary situations, Kprobes tries to start
|
||||
optimizing it again if the situation is changed.
|
||||
@ -240,21 +261,23 @@ Kprobes returns control to the original instruction path by setting
|
||||
the CPU's instruction pointer to the copied code in the detour buffer
|
||||
-- thus at least avoiding the single-step.
|
||||
|
||||
1.4.5 Optimization
|
||||
Optimization
|
||||
^^^^^^^^^^^^
|
||||
|
||||
The Kprobe-optimizer doesn't insert the jump instruction immediately;
|
||||
rather, it calls synchronize_sched() for safety first, because it's
|
||||
possible for a CPU to be interrupted in the middle of executing the
|
||||
optimized region(*). As you know, synchronize_sched() can ensure
|
||||
optimized region [3]_. As you know, synchronize_sched() can ensure
|
||||
that all interruptions that were active when synchronize_sched()
|
||||
was called are done, but only if CONFIG_PREEMPT=n. So, this version
|
||||
of kprobe optimization supports only kernels with CONFIG_PREEMPT=n.(**)
|
||||
of kprobe optimization supports only kernels with CONFIG_PREEMPT=n [4]_.
|
||||
|
||||
After that, the Kprobe-optimizer calls stop_machine() to replace
|
||||
the optimized region with a jump instruction to the detour buffer,
|
||||
using text_poke_smp().
|
||||
|
||||
1.4.6 Unoptimization
|
||||
Unoptimization
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
When an optimized kprobe is unregistered, disabled, or blocked by
|
||||
another kprobe, it will be unoptimized. If this happens before
|
||||
@ -263,15 +286,15 @@ optimized list. If the optimization has been done, the jump is
|
||||
replaced with the original code (except for an int3 breakpoint in
|
||||
the first byte) by using text_poke_smp().
|
||||
|
||||
(*)Please imagine that the 2nd instruction is interrupted and then
|
||||
the optimizer replaces the 2nd instruction with the jump *address*
|
||||
while the interrupt handler is running. When the interrupt
|
||||
returns to original address, there is no valid instruction,
|
||||
and it causes an unexpected result.
|
||||
.. [3] Please imagine that the 2nd instruction is interrupted and then
|
||||
the optimizer replaces the 2nd instruction with the jump *address*
|
||||
while the interrupt handler is running. When the interrupt
|
||||
returns to original address, there is no valid instruction,
|
||||
and it causes an unexpected result.
|
||||
|
||||
(**)This optimization-safety checking may be replaced with the
|
||||
stop-machine method that ksplice uses for supporting a CONFIG_PREEMPT=y
|
||||
kernel.
|
||||
.. [4] This optimization-safety checking may be replaced with the
|
||||
stop-machine method that ksplice uses for supporting a CONFIG_PREEMPT=y
|
||||
kernel.
|
||||
|
||||
NOTE for geeks:
|
||||
The jump optimization changes the kprobe's pre_handler behavior.
|
||||
@ -280,11 +303,17 @@ path by changing regs->ip and returning 1. However, when the probe
|
||||
is optimized, that modification is ignored. Thus, if you want to
|
||||
tweak the kernel's execution path, you need to suppress optimization,
|
||||
using one of the following techniques:
|
||||
|
||||
- Specify an empty function for the kprobe's post_handler or break_handler.
|
||||
or
|
||||
|
||||
or
|
||||
|
||||
- Execute 'sysctl -w debug.kprobes_optimization=n'
|
||||
|
||||
1.5 Blacklist
|
||||
.. _kprobes_blacklist:
|
||||
|
||||
Blacklist
|
||||
---------
|
||||
|
||||
Kprobes can probe most of the kernel except itself. This means
|
||||
that there are some functions where kprobes cannot probe. Probing
|
||||
@ -297,7 +326,10 @@ to specify a blacklisted function.
|
||||
Kprobes checks the given probe address against the blacklist and
|
||||
rejects registering it, if the given address is in the blacklist.
|
||||
|
||||
2. Architectures Supported
|
||||
.. _kprobes_archs_supported:
|
||||
|
||||
Architectures Supported
|
||||
=======================
|
||||
|
||||
Kprobes, jprobes, and return probes are implemented on the following
|
||||
architectures:
|
||||
@ -312,7 +344,8 @@ architectures:
|
||||
- mips
|
||||
- s390
|
||||
|
||||
3. Configuring Kprobes
|
||||
Configuring Kprobes
|
||||
===================
|
||||
|
||||
When configuring the kernel using make menuconfig/xconfig/oldconfig,
|
||||
ensure that CONFIG_KPROBES is set to "y". Under "General setup", look
|
||||
@ -331,7 +364,8 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO),
|
||||
so you can use "objdump -d -l vmlinux" to see the source-to-object
|
||||
code mapping.
|
||||
|
||||
4. API Reference
|
||||
API Reference
|
||||
=============
|
||||
|
||||
The Kprobes API includes a "register" function and an "unregister"
|
||||
function for each type of probe. The API also includes "register_*probes"
|
||||
@ -340,10 +374,13 @@ Here are terse, mini-man-page specifications for these functions and
|
||||
the associated probe handlers that you'll write. See the files in the
|
||||
samples/kprobes/ sub-directory for examples.
|
||||
|
||||
4.1 register_kprobe
|
||||
register_kprobe
|
||||
---------------
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int register_kprobe(struct kprobe *kp);
|
||||
::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int register_kprobe(struct kprobe *kp);
|
||||
|
||||
Sets a breakpoint at the address kp->addr. When the breakpoint is
|
||||
hit, Kprobes calls kp->pre_handler. After the probed instruction
|
||||
@ -354,61 +391,68 @@ kp->fault_handler. Any or all handlers can be NULL. If kp->flags
|
||||
is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled,
|
||||
so its handlers aren't hit until enable_kprobe(kp) is called.
|
||||
|
||||
NOTE:
|
||||
1. With the introduction of the "symbol_name" field to struct kprobe,
|
||||
the probepoint address resolution will now be taken care of by the kernel.
|
||||
The following will now work:
|
||||
.. note::
|
||||
|
||||
1. With the introduction of the "symbol_name" field to struct kprobe,
|
||||
the probepoint address resolution will now be taken care of by the kernel.
|
||||
The following will now work::
|
||||
|
||||
kp.symbol_name = "symbol_name";
|
||||
|
||||
(64-bit powerpc intricacies such as function descriptors are handled
|
||||
transparently)
|
||||
(64-bit powerpc intricacies such as function descriptors are handled
|
||||
transparently)
|
||||
|
||||
2. Use the "offset" field of struct kprobe if the offset into the symbol
|
||||
to install a probepoint is known. This field is used to calculate the
|
||||
probepoint.
|
||||
2. Use the "offset" field of struct kprobe if the offset into the symbol
|
||||
to install a probepoint is known. This field is used to calculate the
|
||||
probepoint.
|
||||
|
||||
3. Specify either the kprobe "symbol_name" OR the "addr". If both are
|
||||
specified, kprobe registration will fail with -EINVAL.
|
||||
3. Specify either the kprobe "symbol_name" OR the "addr". If both are
|
||||
specified, kprobe registration will fail with -EINVAL.
|
||||
|
||||
4. With CISC architectures (such as i386 and x86_64), the kprobes code
|
||||
does not validate if the kprobe.addr is at an instruction boundary.
|
||||
Use "offset" with caution.
|
||||
4. With CISC architectures (such as i386 and x86_64), the kprobes code
|
||||
does not validate if the kprobe.addr is at an instruction boundary.
|
||||
Use "offset" with caution.
|
||||
|
||||
register_kprobe() returns 0 on success, or a negative errno otherwise.
|
||||
|
||||
User's pre-handler (kp->pre_handler):
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/ptrace.h>
|
||||
int pre_handler(struct kprobe *p, struct pt_regs *regs);
|
||||
User's pre-handler (kp->pre_handler)::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/ptrace.h>
|
||||
int pre_handler(struct kprobe *p, struct pt_regs *regs);
|
||||
|
||||
Called with p pointing to the kprobe associated with the breakpoint,
|
||||
and regs pointing to the struct containing the registers saved when
|
||||
the breakpoint was hit. Return 0 here unless you're a Kprobes geek.
|
||||
|
||||
User's post-handler (kp->post_handler):
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/ptrace.h>
|
||||
void post_handler(struct kprobe *p, struct pt_regs *regs,
|
||||
unsigned long flags);
|
||||
User's post-handler (kp->post_handler)::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/ptrace.h>
|
||||
void post_handler(struct kprobe *p, struct pt_regs *regs,
|
||||
unsigned long flags);
|
||||
|
||||
p and regs are as described for the pre_handler. flags always seems
|
||||
to be zero.
|
||||
|
||||
User's fault-handler (kp->fault_handler):
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/ptrace.h>
|
||||
int fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr);
|
||||
User's fault-handler (kp->fault_handler)::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/ptrace.h>
|
||||
int fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr);
|
||||
|
||||
p and regs are as described for the pre_handler. trapnr is the
|
||||
architecture-specific trap number associated with the fault (e.g.,
|
||||
on i386, 13 for a general protection fault or 14 for a page fault).
|
||||
Returns 1 if it successfully handled the exception.
|
||||
|
||||
4.2 register_jprobe
|
||||
register_jprobe
|
||||
---------------
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int register_jprobe(struct jprobe *jp)
|
||||
::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int register_jprobe(struct jprobe *jp)
|
||||
|
||||
Sets a breakpoint at the address jp->kp.addr, which must be the address
|
||||
of the first instruction of a function. When the breakpoint is hit,
|
||||
@ -423,10 +467,13 @@ declaration must match.
|
||||
|
||||
register_jprobe() returns 0 on success, or a negative errno otherwise.
|
||||
|
||||
4.3 register_kretprobe
|
||||
register_kretprobe
|
||||
------------------
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int register_kretprobe(struct kretprobe *rp);
|
||||
::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int register_kretprobe(struct kretprobe *rp);
|
||||
|
||||
Establishes a return probe for the function whose address is
|
||||
rp->kp.addr. When that function returns, Kprobes calls rp->handler.
|
||||
@ -436,14 +483,17 @@ register_kretprobe(); see "How Does a Return Probe Work?" for details.
|
||||
register_kretprobe() returns 0 on success, or a negative errno
|
||||
otherwise.
|
||||
|
||||
User's return-probe handler (rp->handler):
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/ptrace.h>
|
||||
int kretprobe_handler(struct kretprobe_instance *ri, struct pt_regs *regs);
|
||||
User's return-probe handler (rp->handler)::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/ptrace.h>
|
||||
int kretprobe_handler(struct kretprobe_instance *ri,
|
||||
struct pt_regs *regs);
|
||||
|
||||
regs is as described for kprobe.pre_handler. ri points to the
|
||||
kretprobe_instance object, of which the following fields may be
|
||||
of interest:
|
||||
|
||||
- ret_addr: the return address
|
||||
- rp: points to the corresponding kretprobe object
|
||||
- task: points to the corresponding task struct
|
||||
@ -456,74 +506,94 @@ the architecture's ABI.
|
||||
|
||||
The handler's return value is currently ignored.
|
||||
|
||||
4.4 unregister_*probe
|
||||
unregister_*probe
|
||||
------------------
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
void unregister_kprobe(struct kprobe *kp);
|
||||
void unregister_jprobe(struct jprobe *jp);
|
||||
void unregister_kretprobe(struct kretprobe *rp);
|
||||
::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
void unregister_kprobe(struct kprobe *kp);
|
||||
void unregister_jprobe(struct jprobe *jp);
|
||||
void unregister_kretprobe(struct kretprobe *rp);
|
||||
|
||||
Removes the specified probe. The unregister function can be called
|
||||
at any time after the probe has been registered.
|
||||
|
||||
NOTE:
|
||||
If the functions find an incorrect probe (ex. an unregistered probe),
|
||||
they clear the addr field of the probe.
|
||||
.. note::
|
||||
|
||||
4.5 register_*probes
|
||||
If the functions find an incorrect probe (ex. an unregistered probe),
|
||||
they clear the addr field of the probe.
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int register_kprobes(struct kprobe **kps, int num);
|
||||
int register_kretprobes(struct kretprobe **rps, int num);
|
||||
int register_jprobes(struct jprobe **jps, int num);
|
||||
register_*probes
|
||||
----------------
|
||||
|
||||
::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int register_kprobes(struct kprobe **kps, int num);
|
||||
int register_kretprobes(struct kretprobe **rps, int num);
|
||||
int register_jprobes(struct jprobe **jps, int num);
|
||||
|
||||
Registers each of the num probes in the specified array. If any
|
||||
error occurs during registration, all probes in the array, up to
|
||||
the bad probe, are safely unregistered before the register_*probes
|
||||
function returns.
|
||||
- kps/rps/jps: an array of pointers to *probe data structures
|
||||
|
||||
- kps/rps/jps: an array of pointers to ``*probe`` data structures
|
||||
- num: the number of the array entries.
|
||||
|
||||
NOTE:
|
||||
You have to allocate (or define) an array of pointers and set all
|
||||
of the array entries before using these functions.
|
||||
.. note::
|
||||
|
||||
4.6 unregister_*probes
|
||||
You have to allocate (or define) an array of pointers and set all
|
||||
of the array entries before using these functions.
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
void unregister_kprobes(struct kprobe **kps, int num);
|
||||
void unregister_kretprobes(struct kretprobe **rps, int num);
|
||||
void unregister_jprobes(struct jprobe **jps, int num);
|
||||
unregister_*probes
|
||||
------------------
|
||||
|
||||
::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
void unregister_kprobes(struct kprobe **kps, int num);
|
||||
void unregister_kretprobes(struct kretprobe **rps, int num);
|
||||
void unregister_jprobes(struct jprobe **jps, int num);
|
||||
|
||||
Removes each of the num probes in the specified array at once.
|
||||
|
||||
NOTE:
|
||||
If the functions find some incorrect probes (ex. unregistered
|
||||
probes) in the specified array, they clear the addr field of those
|
||||
incorrect probes. However, other probes in the array are
|
||||
unregistered correctly.
|
||||
.. note::
|
||||
|
||||
4.7 disable_*probe
|
||||
If the functions find some incorrect probes (ex. unregistered
|
||||
probes) in the specified array, they clear the addr field of those
|
||||
incorrect probes. However, other probes in the array are
|
||||
unregistered correctly.
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int disable_kprobe(struct kprobe *kp);
|
||||
int disable_kretprobe(struct kretprobe *rp);
|
||||
int disable_jprobe(struct jprobe *jp);
|
||||
disable_*probe
|
||||
--------------
|
||||
|
||||
Temporarily disables the specified *probe. You can enable it again by using
|
||||
::
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int disable_kprobe(struct kprobe *kp);
|
||||
int disable_kretprobe(struct kretprobe *rp);
|
||||
int disable_jprobe(struct jprobe *jp);
|
||||
|
||||
Temporarily disables the specified ``*probe``. You can enable it again by using
|
||||
enable_*probe(). You must specify the probe which has been registered.
|
||||
|
||||
4.8 enable_*probe
|
||||
enable_*probe
|
||||
-------------
|
||||
|
||||
#include <linux/kprobes.h>
|
||||
int enable_kprobe(struct kprobe *kp);
|
||||
int enable_kretprobe(struct kretprobe *rp);
|
||||
int enable_jprobe(struct jprobe *jp);
|
||||
::
|
||||
|
||||
Enables *probe which has been disabled by disable_*probe(). You must specify
|
||||
#include <linux/kprobes.h>
|
||||
int enable_kprobe(struct kprobe *kp);
|
||||
int enable_kretprobe(struct kretprobe *rp);
|
||||
int enable_jprobe(struct jprobe *jp);
|
||||
|
||||
Enables ``*probe`` which has been disabled by disable_*probe(). You must specify
|
||||
the probe which has been registered.
|
||||
|
||||
5. Kprobes Features and Limitations
|
||||
Kprobes Features and Limitations
|
||||
================================
|
||||
|
||||
Kprobes allows multiple probes at the same address. Currently,
|
||||
however, there cannot be multiple jprobes on the same function at
|
||||
@ -538,7 +608,7 @@ are discussed in this section.
|
||||
|
||||
The register_*probe functions will return -EINVAL if you attempt
|
||||
to install a probe in the code that implements Kprobes (mostly
|
||||
kernel/kprobes.c and arch/*/kernel/kprobes.c, but also functions such
|
||||
kernel/kprobes.c and ``arch/*/kernel/kprobes.c``, but also functions such
|
||||
as do_page_fault and notifier_call_chain).
|
||||
|
||||
If you install a probe in an inline-able function, Kprobes makes
|
||||
@ -602,19 +672,21 @@ explain it, we introduce some terminology. Imagine a 3-instruction
|
||||
sequence consisting of two 2-byte instructions and one 3-byte
|
||||
instruction.
|
||||
|
||||
IA
|
||||
|
|
||||
[-2][-1][0][1][2][3][4][5][6][7]
|
||||
[ins1][ins2][ ins3 ]
|
||||
[<- DCR ->]
|
||||
[<- JTPR ->]
|
||||
::
|
||||
|
||||
ins1: 1st Instruction
|
||||
ins2: 2nd Instruction
|
||||
ins3: 3rd Instruction
|
||||
IA: Insertion Address
|
||||
JTPR: Jump Target Prohibition Region
|
||||
DCR: Detoured Code Region
|
||||
IA
|
||||
|
|
||||
[-2][-1][0][1][2][3][4][5][6][7]
|
||||
[ins1][ins2][ ins3 ]
|
||||
[<- DCR ->]
|
||||
[<- JTPR ->]
|
||||
|
||||
ins1: 1st Instruction
|
||||
ins2: 2nd Instruction
|
||||
ins3: 3rd Instruction
|
||||
IA: Insertion Address
|
||||
JTPR: Jump Target Prohibition Region
|
||||
DCR: Detoured Code Region
|
||||
|
||||
The instructions in DCR are copied to the out-of-line buffer
|
||||
of the kprobe, because the bytes in DCR are replaced by
|
||||
@ -628,7 +700,8 @@ d) DCR must not straddle the border between functions.
|
||||
Anyway, these limitations are checked by the in-kernel instruction
|
||||
decoder, so you don't need to worry about that.
|
||||
|
||||
6. Probe Overhead
|
||||
Probe Overhead
|
||||
==============
|
||||
|
||||
On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0
|
||||
microseconds to process. Specifically, a benchmark that hits the same
|
||||
@ -638,70 +711,80 @@ return-probe hit typically takes 50-75% longer than a kprobe hit.
|
||||
When you have a return probe set on a function, adding a kprobe at
|
||||
the entry to that function adds essentially no overhead.
|
||||
|
||||
Here are sample overhead figures (in usec) for different architectures.
|
||||
k = kprobe; j = jprobe; r = return probe; kr = kprobe + return probe
|
||||
on same function; jr = jprobe + return probe on same function
|
||||
Here are sample overhead figures (in usec) for different architectures::
|
||||
|
||||
i386: Intel Pentium M, 1495 MHz, 2957.31 bogomips
|
||||
k = 0.57 usec; j = 1.00; r = 0.92; kr = 0.99; jr = 1.40
|
||||
k = kprobe; j = jprobe; r = return probe; kr = kprobe + return probe
|
||||
on same function; jr = jprobe + return probe on same function::
|
||||
|
||||
x86_64: AMD Opteron 246, 1994 MHz, 3971.48 bogomips
|
||||
k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07
|
||||
i386: Intel Pentium M, 1495 MHz, 2957.31 bogomips
|
||||
k = 0.57 usec; j = 1.00; r = 0.92; kr = 0.99; jr = 1.40
|
||||
|
||||
ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU)
|
||||
k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99
|
||||
x86_64: AMD Opteron 246, 1994 MHz, 3971.48 bogomips
|
||||
k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07
|
||||
|
||||
6.1 Optimized Probe Overhead
|
||||
ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU)
|
||||
k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99
|
||||
|
||||
Optimized Probe Overhead
|
||||
------------------------
|
||||
|
||||
Typically, an optimized kprobe hit takes 0.07 to 0.1 microseconds to
|
||||
process. Here are sample overhead figures (in usec) for x86 architectures.
|
||||
k = unoptimized kprobe, b = boosted (single-step skipped), o = optimized kprobe,
|
||||
r = unoptimized kretprobe, rb = boosted kretprobe, ro = optimized kretprobe.
|
||||
process. Here are sample overhead figures (in usec) for x86 architectures::
|
||||
|
||||
i386: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
|
||||
k = 0.80 usec; b = 0.33; o = 0.05; r = 1.10; rb = 0.61; ro = 0.33
|
||||
k = unoptimized kprobe, b = boosted (single-step skipped), o = optimized kprobe,
|
||||
r = unoptimized kretprobe, rb = boosted kretprobe, ro = optimized kretprobe.
|
||||
|
||||
x86-64: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
|
||||
k = 0.99 usec; b = 0.43; o = 0.06; r = 1.24; rb = 0.68; ro = 0.30
|
||||
i386: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
|
||||
k = 0.80 usec; b = 0.33; o = 0.05; r = 1.10; rb = 0.61; ro = 0.33
|
||||
|
||||
7. TODO
|
||||
x86-64: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
|
||||
k = 0.99 usec; b = 0.43; o = 0.06; r = 1.24; rb = 0.68; ro = 0.30
|
||||
|
||||
TODO
|
||||
====
|
||||
|
||||
a. SystemTap (http://sourceware.org/systemtap): Provides a simplified
|
||||
programming interface for probe-based instrumentation. Try it out.
|
||||
programming interface for probe-based instrumentation. Try it out.
|
||||
b. Kernel return probes for sparc64.
|
||||
c. Support for other architectures.
|
||||
d. User-space probes.
|
||||
e. Watchpoint probes (which fire on data references).
|
||||
|
||||
8. Kprobes Example
|
||||
Kprobes Example
|
||||
===============
|
||||
|
||||
See samples/kprobes/kprobe_example.c
|
||||
|
||||
9. Jprobes Example
|
||||
Jprobes Example
|
||||
===============
|
||||
|
||||
See samples/kprobes/jprobe_example.c
|
||||
|
||||
10. Kretprobes Example
|
||||
Kretprobes Example
|
||||
==================
|
||||
|
||||
See samples/kprobes/kretprobe_example.c
|
||||
|
||||
For additional information on Kprobes, refer to the following URLs:
|
||||
http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe
|
||||
http://www.redhat.com/magazine/005mar05/features/kprobes/
|
||||
http://www-users.cs.umn.edu/~boutcher/kprobes/
|
||||
http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115)
|
||||
|
||||
- http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe
|
||||
- http://www.redhat.com/magazine/005mar05/features/kprobes/
|
||||
- http://www-users.cs.umn.edu/~boutcher/kprobes/
|
||||
- http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115)
|
||||
|
||||
|
||||
Appendix A: The kprobes debugfs interface
|
||||
The kprobes debugfs interface
|
||||
=============================
|
||||
|
||||
|
||||
With recent kernels (> 2.6.20) the list of registered kprobes is visible
|
||||
under the /sys/kernel/debug/kprobes/ directory (assuming debugfs is mounted at /sys/kernel/debug).
|
||||
|
||||
/sys/kernel/debug/kprobes/list: Lists all registered probes on the system
|
||||
/sys/kernel/debug/kprobes/list: Lists all registered probes on the system::
|
||||
|
||||
c015d71a k vfs_read+0x0
|
||||
c011a316 j do_fork+0x0
|
||||
c03dedc5 r tcp_v4_rcv+0x0
|
||||
c015d71a k vfs_read+0x0
|
||||
c011a316 j do_fork+0x0
|
||||
c03dedc5 r tcp_v4_rcv+0x0
|
||||
|
||||
The first column provides the kernel address where the probe is inserted.
|
||||
The second column identifies the type of probe (k - kprobe, r - kretprobe
|
||||
@ -725,17 +808,19 @@ change each probe's disabling state. This means that disabled kprobes (marked
|
||||
[DISABLED]) will not be enabled if you turn ON all kprobes by this knob.
|
||||
|
||||
|
||||
Appendix B: The kprobes sysctl interface
|
||||
The kprobes sysctl interface
|
||||
============================
|
||||
|
||||
/proc/sys/debug/kprobes-optimization: Turn kprobes optimization ON/OFF.
|
||||
|
||||
When CONFIG_OPTPROBES=y, this sysctl interface appears and it provides
|
||||
a knob to globally and forcibly turn jump optimization (see section
|
||||
1.4) ON or OFF. By default, jump optimization is allowed (ON).
|
||||
If you echo "0" to this file or set "debug.kprobes_optimization" to
|
||||
0 via sysctl, all optimized probes will be unoptimized, and any new
|
||||
probes registered after that will not be optimized. Note that this
|
||||
knob *changes* the optimized state. This means that optimized probes
|
||||
(marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be
|
||||
:ref:`kprobes_jump_optimization`) ON or OFF. By default, jump optimization
|
||||
is allowed (ON). If you echo "0" to this file or set
|
||||
"debug.kprobes-optimization" to 0 via sysctl, all optimized probes will be
|
||||
unoptimized, and any new probes registered after that will not be optimized.
|
||||
|
||||
Note that this knob *changes* the optimized state. This means that optimized
|
||||
probes (marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be
|
||||
removed). If the knob is turned on, they will be optimized again.
|
||||
|
||||
|
@ -1,24 +1,42 @@
|
||||
===================================================
|
||||
Adding reference counters (krefs) to kernel objects
|
||||
===================================================
|
||||
|
||||
:Author: Corey Minyard <minyard@acm.org>
|
||||
:Author: Thomas Hellstrom <thellstrom@vmware.com>
|
||||
|
||||
A lot of this was lifted from Greg Kroah-Hartman's 2004 OLS paper and
|
||||
presentation on krefs, which can be found at:
|
||||
|
||||
- http://www.kroah.com/linux/talks/ols_2004_kref_paper/Reprint-Kroah-Hartman-OLS2004.pdf
|
||||
- http://www.kroah.com/linux/talks/ols_2004_kref_talk/
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
krefs allow you to add reference counters to your objects. If you
|
||||
have objects that are used in multiple places and passed around, and
|
||||
you don't have refcounts, your code is almost certainly broken. If
|
||||
you want refcounts, krefs are the way to go.
|
||||
|
||||
To use a kref, add one to your data structures like:
|
||||
To use a kref, add one to your data structures like::
|
||||
|
||||
struct my_data
|
||||
{
|
||||
struct my_data
|
||||
{
|
||||
.
|
||||
.
|
||||
struct kref refcount;
|
||||
.
|
||||
.
|
||||
};
|
||||
};
|
||||
|
||||
The kref can occur anywhere within the data structure.
|
||||
|
||||
Initialization
|
||||
==============
|
||||
|
||||
You must initialize the kref after you allocate it. To do this, call
|
||||
kref_init as so:
|
||||
kref_init as so::
|
||||
|
||||
struct my_data *data;
|
||||
|
||||
@ -29,18 +47,25 @@ kref_init as so:
|
||||
|
||||
This sets the refcount in the kref to 1.
|
||||
|
||||
Kref rules
|
||||
==========
|
||||
|
||||
Once you have an initialized kref, you must follow the following
|
||||
rules:
|
||||
|
||||
1) If you make a non-temporary copy of a pointer, especially if
|
||||
it can be passed to another thread of execution, you must
|
||||
increment the refcount with kref_get() before passing it off:
|
||||
increment the refcount with kref_get() before passing it off::
|
||||
|
||||
kref_get(&data->refcount);
|
||||
|
||||
If you already have a valid pointer to a kref-ed structure (the
|
||||
refcount cannot go to zero) you may do this without a lock.
|
||||
|
||||
2) When you are done with a pointer, you must call kref_put():
|
||||
2) When you are done with a pointer, you must call kref_put()::
|
||||
|
||||
kref_put(&data->refcount, data_release);
|
||||
|
||||
If this is the last reference to the pointer, the release
|
||||
routine will be called. If the code never tries to get
|
||||
a valid pointer to a kref-ed structure without already
|
||||
@ -53,25 +78,25 @@ rules:
|
||||
structure must remain valid during the kref_get().
|
||||
|
||||
For example, if you allocate some data and then pass it to another
|
||||
thread to process:
|
||||
thread to process::
|
||||
|
||||
void data_release(struct kref *ref)
|
||||
{
|
||||
void data_release(struct kref *ref)
|
||||
{
|
||||
struct my_data *data = container_of(ref, struct my_data, refcount);
|
||||
kfree(data);
|
||||
}
|
||||
}
|
||||
|
||||
void more_data_handling(void *cb_data)
|
||||
{
|
||||
void more_data_handling(void *cb_data)
|
||||
{
|
||||
struct my_data *data = cb_data;
|
||||
.
|
||||
. do stuff with data here
|
||||
.
|
||||
kref_put(&data->refcount, data_release);
|
||||
}
|
||||
}
|
||||
|
||||
int my_data_handler(void)
|
||||
{
|
||||
int my_data_handler(void)
|
||||
{
|
||||
int rv = 0;
|
||||
struct my_data *data;
|
||||
struct task_struct *task;
|
||||
@ -91,10 +116,10 @@ int my_data_handler(void)
|
||||
.
|
||||
. do stuff with data here
|
||||
.
|
||||
out:
|
||||
out:
|
||||
kref_put(&data->refcount, data_release);
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
|
||||
This way, it doesn't matter what order the two threads handle the
|
||||
data, the kref_put() handles knowing when the data is not referenced
|
||||
@ -104,7 +129,7 @@ put needs no lock because nothing tries to get the data without
|
||||
already holding a pointer.
|
||||
|
||||
Note that the "before" in rule 1 is very important. You should never
|
||||
do something like:
|
||||
do something like::
|
||||
|
||||
task = kthread_run(more_data_handling, data, "more_data_handling");
|
||||
if (task == ERR_PTR(-ENOMEM)) {
|
||||
@ -124,14 +149,14 @@ bad style. Don't do it.
|
||||
There are some situations where you can optimize the gets and puts.
|
||||
For instance, if you are done with an object and enqueuing it for
|
||||
something else or passing it off to something else, there is no reason
|
||||
to do a get then a put:
|
||||
to do a get then a put::
|
||||
|
||||
/* Silly extra get and put */
|
||||
kref_get(&obj->ref);
|
||||
enqueue(obj);
|
||||
kref_put(&obj->ref, obj_cleanup);
|
||||
|
||||
Just do the enqueue. A comment about this is always welcome:
|
||||
Just do the enqueue. A comment about this is always welcome::
|
||||
|
||||
enqueue(obj);
|
||||
/* We are done with obj, so we pass our refcount off
|
||||
@ -142,109 +167,99 @@ instance, you have a list of items that are each kref-ed, and you wish
|
||||
to get the first one. You can't just pull the first item off the list
|
||||
and kref_get() it. That violates rule 3 because you are not already
|
||||
holding a valid pointer. You must add a mutex (or some other lock).
|
||||
For instance:
|
||||
For instance::
|
||||
|
||||
static DEFINE_MUTEX(mutex);
|
||||
static LIST_HEAD(q);
|
||||
struct my_data
|
||||
{
|
||||
struct kref refcount;
|
||||
struct list_head link;
|
||||
};
|
||||
static DEFINE_MUTEX(mutex);
|
||||
static LIST_HEAD(q);
|
||||
struct my_data
|
||||
{
|
||||
struct kref refcount;
|
||||
struct list_head link;
|
||||
};
|
||||
|
||||
static struct my_data *get_entry()
|
||||
{
|
||||
struct my_data *entry = NULL;
|
||||
mutex_lock(&mutex);
|
||||
if (!list_empty(&q)) {
|
||||
entry = container_of(q.next, struct my_data, link);
|
||||
kref_get(&entry->refcount);
|
||||
static struct my_data *get_entry()
|
||||
{
|
||||
struct my_data *entry = NULL;
|
||||
mutex_lock(&mutex);
|
||||
if (!list_empty(&q)) {
|
||||
entry = container_of(q.next, struct my_data, link);
|
||||
kref_get(&entry->refcount);
|
||||
}
|
||||
mutex_unlock(&mutex);
|
||||
return entry;
|
||||
}
|
||||
mutex_unlock(&mutex);
|
||||
return entry;
|
||||
}
|
||||
|
||||
static void release_entry(struct kref *ref)
|
||||
{
|
||||
struct my_data *entry = container_of(ref, struct my_data, refcount);
|
||||
static void release_entry(struct kref *ref)
|
||||
{
|
||||
struct my_data *entry = container_of(ref, struct my_data, refcount);
|
||||
|
||||
list_del(&entry->link);
|
||||
kfree(entry);
|
||||
}
|
||||
list_del(&entry->link);
|
||||
kfree(entry);
|
||||
}
|
||||
|
||||
static void put_entry(struct my_data *entry)
|
||||
{
|
||||
mutex_lock(&mutex);
|
||||
kref_put(&entry->refcount, release_entry);
|
||||
mutex_unlock(&mutex);
|
||||
}
|
||||
static void put_entry(struct my_data *entry)
|
||||
{
|
||||
mutex_lock(&mutex);
|
||||
kref_put(&entry->refcount, release_entry);
|
||||
mutex_unlock(&mutex);
|
||||
}
|
||||
|
||||
The kref_put() return value is useful if you do not want to hold the
|
||||
lock during the whole release operation. Say you didn't want to call
|
||||
kfree() with the lock held in the example above (since it is kind of
|
||||
pointless to do so). You could use kref_put() as follows:
|
||||
pointless to do so). You could use kref_put() as follows::
|
||||
|
||||
static void release_entry(struct kref *ref)
|
||||
{
|
||||
/* All work is done after the return from kref_put(). */
|
||||
}
|
||||
static void release_entry(struct kref *ref)
|
||||
{
|
||||
/* All work is done after the return from kref_put(). */
|
||||
}
|
||||
|
||||
static void put_entry(struct my_data *entry)
|
||||
{
|
||||
mutex_lock(&mutex);
|
||||
if (kref_put(&entry->refcount, release_entry)) {
|
||||
list_del(&entry->link);
|
||||
mutex_unlock(&mutex);
|
||||
kfree(entry);
|
||||
} else
|
||||
mutex_unlock(&mutex);
|
||||
}
|
||||
static void put_entry(struct my_data *entry)
|
||||
{
|
||||
mutex_lock(&mutex);
|
||||
if (kref_put(&entry->refcount, release_entry)) {
|
||||
list_del(&entry->link);
|
||||
mutex_unlock(&mutex);
|
||||
kfree(entry);
|
||||
} else
|
||||
mutex_unlock(&mutex);
|
||||
}
|
||||
|
||||
This is really more useful if you have to call other routines as part
|
||||
of the free operations that could take a long time or might claim the
|
||||
same lock. Note that doing everything in the release routine is still
|
||||
preferred as it is a little neater.
|
||||
|
||||
|
||||
Corey Minyard <minyard@acm.org>
|
||||
|
||||
A lot of this was lifted from Greg Kroah-Hartman's 2004 OLS paper and
|
||||
presentation on krefs, which can be found at:
|
||||
http://www.kroah.com/linux/talks/ols_2004_kref_paper/Reprint-Kroah-Hartman-OLS2004.pdf
|
||||
and:
|
||||
http://www.kroah.com/linux/talks/ols_2004_kref_talk/
|
||||
|
||||
|
||||
The above example could also be optimized using kref_get_unless_zero() in
|
||||
the following way:
|
||||
the following way::
|
||||
|
||||
static struct my_data *get_entry()
|
||||
{
|
||||
struct my_data *entry = NULL;
|
||||
mutex_lock(&mutex);
|
||||
if (!list_empty(&q)) {
|
||||
entry = container_of(q.next, struct my_data, link);
|
||||
if (!kref_get_unless_zero(&entry->refcount))
|
||||
entry = NULL;
|
||||
static struct my_data *get_entry()
|
||||
{
|
||||
struct my_data *entry = NULL;
|
||||
mutex_lock(&mutex);
|
||||
if (!list_empty(&q)) {
|
||||
entry = container_of(q.next, struct my_data, link);
|
||||
if (!kref_get_unless_zero(&entry->refcount))
|
||||
entry = NULL;
|
||||
}
|
||||
mutex_unlock(&mutex);
|
||||
return entry;
|
||||
}
|
||||
mutex_unlock(&mutex);
|
||||
return entry;
|
||||
}
|
||||
|
||||
static void release_entry(struct kref *ref)
|
||||
{
|
||||
struct my_data *entry = container_of(ref, struct my_data, refcount);
|
||||
static void release_entry(struct kref *ref)
|
||||
{
|
||||
struct my_data *entry = container_of(ref, struct my_data, refcount);
|
||||
|
||||
mutex_lock(&mutex);
|
||||
list_del(&entry->link);
|
||||
mutex_unlock(&mutex);
|
||||
kfree(entry);
|
||||
}
|
||||
mutex_lock(&mutex);
|
||||
list_del(&entry->link);
|
||||
mutex_unlock(&mutex);
|
||||
kfree(entry);
|
||||
}
|
||||
|
||||
static void put_entry(struct my_data *entry)
|
||||
{
|
||||
kref_put(&entry->refcount, release_entry);
|
||||
}
|
||||
static void put_entry(struct my_data *entry)
|
||||
{
|
||||
kref_put(&entry->refcount, release_entry);
|
||||
}
|
||||
|
||||
Which is useful to remove the mutex lock around kref_put() in put_entry(), but
|
||||
it's important that kref_get_unless_zero is enclosed in the same critical
|
||||
@ -254,51 +269,51 @@ Note that it is illegal to use kref_get_unless_zero without checking its
|
||||
return value. If you are sure (by already having a valid pointer) that
|
||||
kref_get_unless_zero() will return true, then use kref_get() instead.
|
||||
|
||||
Krefs and RCU
|
||||
=============
|
||||
|
||||
The function kref_get_unless_zero also makes it possible to use rcu
|
||||
locking for lookups in the above example:
|
||||
locking for lookups in the above example::
|
||||
|
||||
struct my_data
|
||||
{
|
||||
struct rcu_head rhead;
|
||||
.
|
||||
struct kref refcount;
|
||||
.
|
||||
.
|
||||
};
|
||||
struct my_data
|
||||
{
|
||||
struct rcu_head rhead;
|
||||
.
|
||||
struct kref refcount;
|
||||
.
|
||||
.
|
||||
};
|
||||
|
||||
static struct my_data *get_entry_rcu()
|
||||
{
|
||||
struct my_data *entry = NULL;
|
||||
rcu_read_lock();
|
||||
if (!list_empty(&q)) {
|
||||
entry = container_of(q.next, struct my_data, link);
|
||||
if (!kref_get_unless_zero(&entry->refcount))
|
||||
entry = NULL;
|
||||
static struct my_data *get_entry_rcu()
|
||||
{
|
||||
struct my_data *entry = NULL;
|
||||
rcu_read_lock();
|
||||
if (!list_empty(&q)) {
|
||||
entry = container_of(q.next, struct my_data, link);
|
||||
if (!kref_get_unless_zero(&entry->refcount))
|
||||
entry = NULL;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return entry;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return entry;
|
||||
}
|
||||
|
||||
static void release_entry_rcu(struct kref *ref)
|
||||
{
|
||||
struct my_data *entry = container_of(ref, struct my_data, refcount);
|
||||
static void release_entry_rcu(struct kref *ref)
|
||||
{
|
||||
struct my_data *entry = container_of(ref, struct my_data, refcount);
|
||||
|
||||
mutex_lock(&mutex);
|
||||
list_del_rcu(&entry->link);
|
||||
mutex_unlock(&mutex);
|
||||
kfree_rcu(entry, rhead);
|
||||
}
|
||||
mutex_lock(&mutex);
|
||||
list_del_rcu(&entry->link);
|
||||
mutex_unlock(&mutex);
|
||||
kfree_rcu(entry, rhead);
|
||||
}
|
||||
|
||||
static void put_entry(struct my_data *entry)
|
||||
{
|
||||
kref_put(&entry->refcount, release_entry_rcu);
|
||||
}
|
||||
static void put_entry(struct my_data *entry)
|
||||
{
|
||||
kref_put(&entry->refcount, release_entry_rcu);
|
||||
}
|
||||
|
||||
But note that the struct kref member needs to remain in valid memory for a
|
||||
rcu grace period after release_entry_rcu was called. That can be accomplished
|
||||
by using kfree_rcu(entry, rhead) as done above, or by calling synchronize_rcu()
|
||||
before using kfree, but note that synchronize_rcu() may sleep for a
|
||||
substantial amount of time.
|
||||
|
||||
|
||||
Thomas Hellstrom <thellstrom@vmware.com>
|
||||
|
@ -1,9 +1,9 @@
|
||||
==========================================
|
||||
LDM - Logical Disk Manager (Dynamic Disks)
|
||||
==========================================
|
||||
|
||||
LDM - Logical Disk Manager (Dynamic Disks)
|
||||
------------------------------------------
|
||||
|
||||
Originally Written by FlatCap - Richard Russon <ldm@flatcap.org>.
|
||||
Last Updated by Anton Altaparmakov on 30 March 2007 for Windows Vista.
|
||||
:Author: Originally Written by FlatCap - Richard Russon <ldm@flatcap.org>.
|
||||
:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista.
|
||||
|
||||
Overview
|
||||
--------
|
||||
@ -37,24 +37,36 @@ Example
|
||||
-------
|
||||
|
||||
Below we have a 50MiB disk, divided into seven partitions.
|
||||
N.B. The missing 1MiB at the end of the disk is where the LDM database is
|
||||
stored.
|
||||
|
||||
Device | Offset Bytes Sectors MiB | Size Bytes Sectors MiB
|
||||
-------+----------------------------+---------------------------
|
||||
hda | 0 0 0 | 52428800 102400 50
|
||||
hda1 | 51380224 100352 49 | 1048576 2048 1
|
||||
hda2 | 16384 32 0 | 6979584 13632 6
|
||||
hda3 | 6995968 13664 6 | 10485760 20480 10
|
||||
hda4 | 17481728 34144 16 | 4194304 8192 4
|
||||
hda5 | 21676032 42336 20 | 5242880 10240 5
|
||||
hda6 | 26918912 52576 25 | 10485760 20480 10
|
||||
hda7 | 37404672 73056 35 | 13959168 27264 13
|
||||
.. note::
|
||||
|
||||
The missing 1MiB at the end of the disk is where the LDM database is
|
||||
stored.
|
||||
|
||||
+-------++--------------+---------+-----++--------------+---------+----+
|
||||
|Device || Offset Bytes | Sectors | MiB || Size Bytes | Sectors | MiB|
|
||||
+=======++==============+=========+=====++==============+=========+====+
|
||||
|hda || 0 | 0 | 0 || 52428800 | 102400 | 50|
|
||||
+-------++--------------+---------+-----++--------------+---------+----+
|
||||
|hda1 || 51380224 | 100352 | 49 || 1048576 | 2048 | 1|
|
||||
+-------++--------------+---------+-----++--------------+---------+----+
|
||||
|hda2 || 16384 | 32 | 0 || 6979584 | 13632 | 6|
|
||||
+-------++--------------+---------+-----++--------------+---------+----+
|
||||
|hda3 || 6995968 | 13664 | 6 || 10485760 | 20480 | 10|
|
||||
+-------++--------------+---------+-----++--------------+---------+----+
|
||||
|hda4 || 17481728 | 34144 | 16 || 4194304 | 8192 | 4|
|
||||
+-------++--------------+---------+-----++--------------+---------+----+
|
||||
|hda5 || 21676032 | 42336 | 20 || 5242880 | 10240 | 5|
|
||||
+-------++--------------+---------+-----++--------------+---------+----+
|
||||
|hda6 || 26918912 | 52576 | 25 || 10485760 | 20480 | 10|
|
||||
+-------++--------------+---------+-----++--------------+---------+----+
|
||||
|hda7 || 37404672 | 73056 | 35 || 13959168 | 27264 | 13|
|
||||
+-------++--------------+---------+-----++--------------+---------+----+
|
||||
|
||||
The LDM Database may not store the partitions in the order that they appear on
|
||||
disk, but the driver will sort them.
|
||||
|
||||
When Linux boots, you will see something like:
|
||||
When Linux boots, you will see something like::
|
||||
|
||||
hda: 102400 sectors w/32KiB Cache, CHS=50/64/32
|
||||
hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7
|
||||
@ -65,13 +77,13 @@ Compiling LDM Support
|
||||
|
||||
To enable LDM, choose the following two options:
|
||||
|
||||
"Advanced partition selection" CONFIG_PARTITION_ADVANCED
|
||||
"Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION
|
||||
- "Advanced partition selection" CONFIG_PARTITION_ADVANCED
|
||||
- "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION
|
||||
|
||||
If you believe the driver isn't working as it should, you can enable the extra
|
||||
debugging code. This will produce a LOT of output. The option is:
|
||||
|
||||
"Windows LDM extra logging" CONFIG_LDM_DEBUG
|
||||
- "Windows LDM extra logging" CONFIG_LDM_DEBUG
|
||||
|
||||
N.B. The partition code cannot be compiled as a module.
|
||||
|
||||
|
@ -30,7 +30,8 @@ timeout is set through the confusingly named "kernel.panic" sysctl),
|
||||
to cause the system to reboot automatically after a specified amount
|
||||
of time.
|
||||
|
||||
=== Implementation ===
|
||||
Implementation
|
||||
==============
|
||||
|
||||
The soft and hard lockup detectors are built on top of the hrtimer and
|
||||
perf subsystems, respectively. A direct consequence of this is that,
|
||||
|
@ -1,8 +1,9 @@
|
||||
|
||||
===========================================================
|
||||
LZO stream format as understood by Linux's LZO decompressor
|
||||
===========================================================
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
This is not a specification. No specification seems to be publicly available
|
||||
for the LZO stream format. This document describes what input format the LZO
|
||||
@ -14,12 +15,13 @@ Introduction
|
||||
for future bug reports.
|
||||
|
||||
Description
|
||||
===========
|
||||
|
||||
The stream is composed of a series of instructions, operands, and data. The
|
||||
instructions consist in a few bits representing an opcode, and bits forming
|
||||
the operands for the instruction, whose size and position depend on the
|
||||
opcode and on the number of literals copied by previous instruction. The
|
||||
operands are used to indicate :
|
||||
operands are used to indicate:
|
||||
|
||||
- a distance when copying data from the dictionary (past output buffer)
|
||||
- a length (number of bytes to copy from dictionary)
|
||||
@ -38,7 +40,7 @@ Description
|
||||
of bits in the operand. If the number of bits isn't enough to represent the
|
||||
length, up to 255 may be added in increments by consuming more bytes with a
|
||||
rate of at most 255 per extra byte (thus the compression ratio cannot exceed
|
||||
around 255:1). The variable length encoding using #bits is always the same :
|
||||
around 255:1). The variable length encoding using #bits is always the same::
|
||||
|
||||
length = byte & ((1 << #bits) - 1)
|
||||
if (!length) {
|
||||
@ -67,15 +69,19 @@ Description
|
||||
instruction may encode this distance (0001HLLL), it takes one LE16 operand
|
||||
for the distance, thus requiring 3 bytes.
|
||||
|
||||
IMPORTANT NOTE : in the code some length checks are missing because certain
|
||||
instructions are called under the assumption that a certain number of bytes
|
||||
follow because it has already been guaranteed before parsing the instructions.
|
||||
They just have to "refill" this credit if they consume extra bytes. This is
|
||||
an implementation design choice independent on the algorithm or encoding.
|
||||
.. important::
|
||||
|
||||
In the code some length checks are missing because certain instructions
|
||||
are called under the assumption that a certain number of bytes follow
|
||||
because it has already been guaranteed before parsing the instructions.
|
||||
They just have to "refill" this credit if they consume extra bytes. This
|
||||
is an implementation design choice independent of the algorithm or
|
||||
encoding.
|
||||
|
||||
Byte sequences
|
||||
==============
|
||||
|
||||
First byte encoding :
|
||||
First byte encoding::
|
||||
|
||||
0..17 : follow regular instruction encoding, see below. It is worth
|
||||
noting that codes 16 and 17 will represent a block copy from
|
||||
@ -91,7 +97,7 @@ Byte sequences
|
||||
state = 4 [ don't copy extra literals ]
|
||||
skip byte
|
||||
|
||||
Instruction encoding :
|
||||
Instruction encoding::
|
||||
|
||||
0 0 0 0 X X X X (0..15)
|
||||
Depends on the number of literals copied by the last instruction.
|
||||
@ -156,6 +162,7 @@ Byte sequences
|
||||
distance = (H << 3) + D + 1
|
||||
|
||||
Authors
|
||||
=======
|
||||
|
||||
This document was written by Willy Tarreau <w@1wt.eu> on 2014/07/19 during an
|
||||
analysis of the decompression code available in Linux 3.16-rc5. The code is
|
||||
|
@ -1,7 +1,10 @@
|
||||
The Common Mailbox Framework
|
||||
Jassi Brar <jaswinder.singh@linaro.org>
|
||||
============================
|
||||
The Common Mailbox Framework
|
||||
============================
|
||||
|
||||
This document aims to help developers write client and controller
|
||||
:Author: Jassi Brar <jaswinder.singh@linaro.org>
|
||||
|
||||
This document aims to help developers write client and controller
|
||||
drivers for the API. But before we start, let us note that the
|
||||
client (especially) and controller drivers are likely going to be
|
||||
very platform specific because the remote firmware is likely to be
|
||||
@ -13,14 +16,17 @@ similar copies of code written for each platform. Having said that,
|
||||
nothing prevents the remote f/w to also be Linux based and use the
|
||||
same api there. However none of that helps us locally because we only
|
||||
ever deal at client's protocol level.
|
||||
Some of the choices made during implementation are the result of this
|
||||
|
||||
Some of the choices made during implementation are the result of this
|
||||
peculiarity of this "common" framework.
|
||||
|
||||
|
||||
|
||||
Part 1 - Controller Driver (See include/linux/mailbox_controller.h)
|
||||
Controller Driver (See include/linux/mailbox_controller.h)
|
||||
==========================================================
|
||||
|
||||
Allocate mbox_controller and the array of mbox_chan.
|
||||
|
||||
Allocate mbox_controller and the array of mbox_chan.
|
||||
Populate mbox_chan_ops, except peek_data() all are mandatory.
|
||||
The controller driver might know a message has been consumed
|
||||
by the remote by getting an IRQ or polling some hardware flag
|
||||
@ -30,91 +36,94 @@ the controller driver should set via 'txdone_irq' or 'txdone_poll'
|
||||
or neither.
|
||||
|
||||
|
||||
Part 2 - Client Driver (See include/linux/mailbox_client.h)
|
||||
Client Driver (See include/linux/mailbox_client.h)
|
||||
==================================================
|
||||
|
||||
The client might want to operate in blocking mode (synchronously
|
||||
|
||||
The client might want to operate in blocking mode (synchronously
|
||||
send a message through before returning) or non-blocking/async mode (submit
|
||||
a message and a callback function to the API and return immediately).
|
||||
|
||||
::
|
||||
|
||||
struct demo_client {
|
||||
struct mbox_client cl;
|
||||
struct mbox_chan *mbox;
|
||||
struct completion c;
|
||||
bool async;
|
||||
/* ... */
|
||||
};
|
||||
struct demo_client {
|
||||
struct mbox_client cl;
|
||||
struct mbox_chan *mbox;
|
||||
struct completion c;
|
||||
bool async;
|
||||
/* ... */
|
||||
};
|
||||
|
||||
/*
|
||||
* This is the handler for data received from remote. The behaviour is purely
|
||||
* dependent upon the protocol. This is just an example.
|
||||
*/
|
||||
static void message_from_remote(struct mbox_client *cl, void *mssg)
|
||||
{
|
||||
struct demo_client *dc = container_of(cl, struct demo_client, cl);
|
||||
if (dc->async) {
|
||||
if (is_an_ack(mssg)) {
|
||||
/* An ACK to our last sample sent */
|
||||
return; /* Or do something else here */
|
||||
} else { /* A new message from remote */
|
||||
queue_req(mssg);
|
||||
/*
|
||||
* This is the handler for data received from remote. The behaviour is purely
|
||||
* dependent upon the protocol. This is just an example.
|
||||
*/
|
||||
static void message_from_remote(struct mbox_client *cl, void *mssg)
|
||||
{
|
||||
struct demo_client *dc = container_of(cl, struct demo_client, cl);
|
||||
if (dc->async) {
|
||||
if (is_an_ack(mssg)) {
|
||||
/* An ACK to our last sample sent */
|
||||
return; /* Or do something else here */
|
||||
} else { /* A new message from remote */
|
||||
queue_req(mssg);
|
||||
}
|
||||
} else {
|
||||
/* Remote f/w sends only ACK packets on this channel */
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
/* Remote f/w sends only ACK packets on this channel */
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static void sample_sent(struct mbox_client *cl, void *mssg, int r)
|
||||
{
|
||||
struct demo_client *dc = container_of(cl, struct demo_client, cl);
|
||||
complete(&dc->c);
|
||||
}
|
||||
static void sample_sent(struct mbox_client *cl, void *mssg, int r)
|
||||
{
|
||||
struct demo_client *dc = container_of(cl, struct demo_client, cl);
|
||||
complete(&dc->c);
|
||||
}
|
||||
|
||||
static void client_demo(struct platform_device *pdev)
|
||||
{
|
||||
struct demo_client *dc_sync, *dc_async;
|
||||
/* The controller already knows async_pkt and sync_pkt */
|
||||
struct async_pkt ap;
|
||||
struct sync_pkt sp;
|
||||
static void client_demo(struct platform_device *pdev)
|
||||
{
|
||||
struct demo_client *dc_sync, *dc_async;
|
||||
/* The controller already knows async_pkt and sync_pkt */
|
||||
struct async_pkt ap;
|
||||
struct sync_pkt sp;
|
||||
|
||||
dc_sync = kzalloc(sizeof(*dc_sync), GFP_KERNEL);
|
||||
dc_async = kzalloc(sizeof(*dc_async), GFP_KERNEL);
|
||||
dc_sync = kzalloc(sizeof(*dc_sync), GFP_KERNEL);
|
||||
dc_async = kzalloc(sizeof(*dc_async), GFP_KERNEL);
|
||||
|
||||
/* Populate non-blocking mode client */
|
||||
dc_async->cl.dev = &pdev->dev;
|
||||
dc_async->cl.rx_callback = message_from_remote;
|
||||
dc_async->cl.tx_done = sample_sent;
|
||||
dc_async->cl.tx_block = false;
|
||||
dc_async->cl.tx_tout = 0; /* doesn't matter here */
|
||||
dc_async->cl.knows_txdone = false; /* depending upon protocol */
|
||||
dc_async->async = true;
|
||||
init_completion(&dc_async->c);
|
||||
/* Populate non-blocking mode client */
|
||||
dc_async->cl.dev = &pdev->dev;
|
||||
dc_async->cl.rx_callback = message_from_remote;
|
||||
dc_async->cl.tx_done = sample_sent;
|
||||
dc_async->cl.tx_block = false;
|
||||
dc_async->cl.tx_tout = 0; /* doesn't matter here */
|
||||
dc_async->cl.knows_txdone = false; /* depending upon protocol */
|
||||
dc_async->async = true;
|
||||
init_completion(&dc_async->c);
|
||||
|
||||
/* Populate blocking mode client */
|
||||
dc_sync->cl.dev = &pdev->dev;
|
||||
dc_sync->cl.rx_callback = message_from_remote;
|
||||
dc_sync->cl.tx_done = NULL; /* operate in blocking mode */
|
||||
dc_sync->cl.tx_block = true;
|
||||
dc_sync->cl.tx_tout = 500; /* by half a second */
|
||||
dc_sync->cl.knows_txdone = false; /* depending upon protocol */
|
||||
dc_sync->async = false;
|
||||
/* Populate blocking mode client */
|
||||
dc_sync->cl.dev = &pdev->dev;
|
||||
dc_sync->cl.rx_callback = message_from_remote;
|
||||
dc_sync->cl.tx_done = NULL; /* operate in blocking mode */
|
||||
dc_sync->cl.tx_block = true;
|
||||
dc_sync->cl.tx_tout = 500; /* by half a second */
|
||||
dc_sync->cl.knows_txdone = false; /* depending upon protocol */
|
||||
dc_sync->async = false;
|
||||
|
||||
/* ASync mailbox is listed second in 'mboxes' property */
|
||||
dc_async->mbox = mbox_request_channel(&dc_async->cl, 1);
|
||||
/* Populate data packet */
|
||||
/* ap.xxx = 123; etc */
|
||||
/* Send async message to remote */
|
||||
mbox_send_message(dc_async->mbox, &ap);
|
||||
/* ASync mailbox is listed second in 'mboxes' property */
|
||||
dc_async->mbox = mbox_request_channel(&dc_async->cl, 1);
|
||||
/* Populate data packet */
|
||||
/* ap.xxx = 123; etc */
|
||||
/* Send async message to remote */
|
||||
mbox_send_message(dc_async->mbox, &ap);
|
||||
|
||||
/* Sync mailbox is listed first in 'mboxes' property */
|
||||
dc_sync->mbox = mbox_request_channel(&dc_sync->cl, 0);
|
||||
/* Populate data packet */
|
||||
/* sp.abc = 123; etc */
|
||||
/* Send message to remote in blocking mode */
|
||||
mbox_send_message(dc_sync->mbox, &sp);
|
||||
/* At this point 'sp' has been sent */
|
||||
/* Sync mailbox is listed first in 'mboxes' property */
|
||||
dc_sync->mbox = mbox_request_channel(&dc_sync->cl, 0);
|
||||
/* Populate data packet */
|
||||
/* sp.abc = 123; etc */
|
||||
/* Send message to remote in blocking mode */
|
||||
mbox_send_message(dc_sync->mbox, &sp);
|
||||
/* At this point 'sp' has been sent */
|
||||
|
||||
/* Now wait for async chan to be done */
|
||||
wait_for_completion(&dc_async->c);
|
||||
}
|
||||
/* Now wait for async chan to be done */
|
||||
wait_for_completion(&dc_async->c);
|
||||
}
|
||||
|
@ -1876,8 +1876,8 @@ There are some more advanced barrier functions:
|
||||
This makes sure that the death mark on the object is perceived to be set
|
||||
*before* the reference counter is decremented.
|
||||
|
||||
See Documentation/atomic_ops.txt for more information. See the "Atomic
|
||||
operations" subsection for information on where to use these.
|
||||
See Documentation/core-api/atomic_ops.rst for more information. See the
|
||||
"Atomic operations" subsection for information on where to use these.
|
||||
|
||||
|
||||
(*) lockless_dereference();
|
||||
@ -2584,7 +2584,7 @@ situations because on some CPUs the atomic instructions used imply full memory
|
||||
barriers, and so barrier instructions are superfluous in conjunction with them,
|
||||
and in such cases the special barrier primitives will be no-ops.
|
||||
|
||||
See Documentation/atomic_ops.txt for more information.
|
||||
See Documentation/core-api/atomic_ops.rst for more information.
|
||||
|
||||
|
||||
ACCESSING DEVICES
|
||||
|
@ -2,43 +2,48 @@
|
||||
Memory Hotplug
|
||||
==============
|
||||
|
||||
Created: Jul 28 2007
|
||||
Add description of notifier of memory hotplug Oct 11 2007
|
||||
:Created: Jul 28 2007
|
||||
:Updated: Add description of notifier of memory hotplug: Oct 11 2007
|
||||
|
||||
This document is about memory hotplug including how-to-use and current status.
|
||||
Because Memory Hotplug is still under development, contents of this text will
|
||||
be changed often.
|
||||
|
||||
1. Introduction
|
||||
1.1 purpose of memory hotplug
|
||||
1.2. Phases of memory hotplug
|
||||
1.3. Unit of Memory online/offline operation
|
||||
2. Kernel Configuration
|
||||
3. sysfs files for memory hotplug
|
||||
4. Physical memory hot-add phase
|
||||
4.1 Hardware(Firmware) Support
|
||||
4.2 Notify memory hot-add event by hand
|
||||
5. Logical Memory hot-add phase
|
||||
5.1. State of memory
|
||||
5.2. How to online memory
|
||||
6. Logical memory remove
|
||||
6.1 Memory offline and ZONE_MOVABLE
|
||||
6.2. How to offline memory
|
||||
7. Physical memory remove
|
||||
8. Memory hotplug event notifier
|
||||
9. Future Work List
|
||||
.. CONTENTS
|
||||
|
||||
Note(1): x86_64's has special implementation for memory hotplug.
|
||||
This text does not describe it.
|
||||
Note(2): This text assumes that sysfs is mounted at /sys.
|
||||
1. Introduction
|
||||
1.1 purpose of memory hotplug
|
||||
1.2. Phases of memory hotplug
|
||||
1.3. Unit of Memory online/offline operation
|
||||
2. Kernel Configuration
|
||||
3. sysfs files for memory hotplug
|
||||
4. Physical memory hot-add phase
|
||||
4.1 Hardware(Firmware) Support
|
||||
4.2 Notify memory hot-add event by hand
|
||||
5. Logical Memory hot-add phase
|
||||
5.1. State of memory
|
||||
5.2. How to online memory
|
||||
6. Logical memory remove
|
||||
6.1 Memory offline and ZONE_MOVABLE
|
||||
6.2. How to offline memory
|
||||
7. Physical memory remove
|
||||
8. Memory hotplug event notifier
|
||||
9. Future Work List
|
||||
|
||||
|
||||
---------------
|
||||
1. Introduction
|
||||
---------------
|
||||
.. note::
|
||||
|
||||
(1) x86_64 has a special implementation for memory hotplug.
|
||||
This text does not describe it.
|
||||
(2) This text assumes that sysfs is mounted at /sys.
|
||||
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
purpose of memory hotplug
|
||||
-------------------------
|
||||
|
||||
1.1 purpose of memory hotplug
|
||||
------------
|
||||
Memory Hotplug allows users to increase/decrease the amount of memory.
|
||||
Generally, there are two purposes.
|
||||
|
||||
@ -53,9 +58,11 @@ hardware which supports memory power management.
|
||||
Linux memory hotplug is designed for both purposes.
|
||||
|
||||
|
||||
1.2. Phases of memory hotplug
|
||||
---------------
|
||||
There are 2 phases in Memory Hotplug.
|
||||
Phases of memory hotplug
|
||||
------------------------
|
||||
|
||||
There are 2 phases in Memory Hotplug:
|
||||
|
||||
1) Physical Memory Hotplug phase
|
||||
2) Logical Memory Hotplug phase.
|
||||
|
||||
@ -70,7 +77,7 @@ management tables, and makes sysfs files for new memory's operation.
|
||||
If firmware supports notification of connection of new memory to OS,
|
||||
this phase is triggered automatically. ACPI can notify this event. If not,
|
||||
"probe" operation by system administration is used instead.
|
||||
(see Section 4.).
|
||||
(see :ref:`memory_hotplug_physical_mem`).
|
||||
|
||||
Logical Memory Hotplug phase is to change memory state into
|
||||
available/unavailable for users. Amount of memory from user's view is
|
||||
@ -83,11 +90,12 @@ Logical Memory Hotplug phase is triggered by write of sysfs file by system
|
||||
administrator. For the hot-add case, it must be executed after Physical Hotplug
|
||||
phase by hand.
|
||||
(However, if you write udev's hotplug scripts for memory hotplug, these
|
||||
phases can be execute in seamless way.)
|
||||
phases can be executed in a seamless way.)
|
||||
|
||||
|
||||
1.3. Unit of Memory online/offline operation
|
||||
------------
|
||||
Unit of Memory online/offline operation
|
||||
---------------------------------------
|
||||
|
||||
Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
|
||||
into chunks of the same size. These chunks are called "sections". The size of
|
||||
a memory section is architecture dependent. For example, power uses 16MiB, ia64
|
||||
@ -97,46 +105,50 @@ Memory sections are combined into chunks referred to as "memory blocks". The
|
||||
size of a memory block is architecture dependent and represents the logical
|
||||
unit upon which memory online/offline operations are to be performed. The
|
||||
default size of a memory block is the same as memory section size unless an
|
||||
architecture specifies otherwise. (see Section 3.)
|
||||
architecture specifies otherwise. (see :ref:`memory_hotplug_sysfs_files`.)
|
||||
|
||||
To determine the size (in bytes) of a memory block please read this file:
|
||||
|
||||
/sys/devices/system/memory/block_size_bytes
|
||||
|
||||
|
||||
-----------------------
|
||||
2. Kernel Configuration
|
||||
-----------------------
|
||||
Kernel Configuration
|
||||
====================
|
||||
|
||||
To use memory hotplug feature, kernel must be compiled with following
|
||||
config options.
|
||||
|
||||
- For all memory hotplug
|
||||
Memory model -> Sparse Memory (CONFIG_SPARSEMEM)
|
||||
Allow for memory hot-add (CONFIG_MEMORY_HOTPLUG)
|
||||
- For all memory hotplug:
|
||||
- Memory model -> Sparse Memory (CONFIG_SPARSEMEM)
|
||||
- Allow for memory hot-add (CONFIG_MEMORY_HOTPLUG)
|
||||
|
||||
- To enable memory removal, the following are also necessary
|
||||
Allow for memory hot remove (CONFIG_MEMORY_HOTREMOVE)
|
||||
Page Migration (CONFIG_MIGRATION)
|
||||
- To enable memory removal, the following are also necessary:
|
||||
- Allow for memory hot remove (CONFIG_MEMORY_HOTREMOVE)
|
||||
- Page Migration (CONFIG_MIGRATION)
|
||||
|
||||
- For ACPI memory hotplug, the following are also necessary
|
||||
Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY)
|
||||
This option can be kernel module.
|
||||
- For ACPI memory hotplug, the following are also necessary:
|
||||
- Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY)
|
||||
- This option can be kernel module.
|
||||
|
||||
- As a related configuration, if your box has a feature of NUMA-node hotplug
|
||||
via ACPI, then this option is necessary too.
|
||||
ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu)
|
||||
(CONFIG_ACPI_CONTAINER).
|
||||
This option can be kernel module too.
|
||||
|
||||
- ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu)
|
||||
(CONFIG_ACPI_CONTAINER).
|
||||
|
||||
This option can be kernel module too.
|
||||
|
||||
|
||||
--------------------------------
|
||||
3 sysfs files for memory hotplug
|
||||
--------------------------------
|
||||
.. _memory_hotplug_sysfs_files:
|
||||
|
||||
sysfs files for memory hotplug
|
||||
==============================
|
||||
|
||||
All memory blocks have their device information in sysfs. Each memory block
|
||||
is described under /sys/devices/system/memory as
|
||||
is described under /sys/devices/system/memory as:
|
||||
|
||||
/sys/devices/system/memory/memoryXXX
|
||||
(XXX is the memory block id.)
|
||||
/sys/devices/system/memory/memoryXXX
|
||||
(XXX is the memory block id.)
|
||||
|
||||
For the memory block covered by the sysfs directory. It is expected that all
|
||||
memory sections in this range are present and no memory holes exist in the
|
||||
@ -145,43 +157,53 @@ the existence of one should not affect the hotplug capabilities of the memory
|
||||
block.
|
||||
|
||||
For example, assume 1GiB memory block size. A device for a memory starting at
|
||||
0x100000000 is /sys/device/system/memory/memory4
|
||||
(0x100000000 / 1Gib = 4)
|
||||
0x100000000 is /sys/devices/system/memory/memory4::
|
||||
|
||||
(0x100000000 / 1GiB = 4)
|
||||
|
||||
This device covers address range [0x100000000 ... 0x140000000)
|
||||
|
||||
Under each memory block, you can see 5 files:
|
||||
|
||||
/sys/devices/system/memory/memoryXXX/phys_index
|
||||
/sys/devices/system/memory/memoryXXX/phys_device
|
||||
/sys/devices/system/memory/memoryXXX/state
|
||||
/sys/devices/system/memory/memoryXXX/removable
|
||||
/sys/devices/system/memory/memoryXXX/valid_zones
|
||||
- /sys/devices/system/memory/memoryXXX/phys_index
|
||||
- /sys/devices/system/memory/memoryXXX/phys_device
|
||||
- /sys/devices/system/memory/memoryXXX/state
|
||||
- /sys/devices/system/memory/memoryXXX/removable
|
||||
- /sys/devices/system/memory/memoryXXX/valid_zones
|
||||
|
||||
=================== ============================================================
|
||||
``phys_index`` read-only and contains memory block id, same as XXX.
|
||||
``state`` read-write
|
||||
|
||||
- at read: contains online/offline state of memory.
|
||||
- at write: user can specify "online_kernel",
|
||||
|
||||
'phys_index' : read-only and contains memory block id, same as XXX.
|
||||
'state' : read-write
|
||||
at read: contains online/offline state of memory.
|
||||
at write: user can specify "online_kernel",
|
||||
"online_movable", "online", "offline" command
|
||||
which will be performed on all sections in the block.
|
||||
'phys_device' : read-only: designed to show the name of physical memory
|
||||
``phys_device`` read-only: designed to show the name of physical memory
|
||||
device. This is not well implemented now.
|
||||
'removable' : read-only: contains an integer value indicating
|
||||
``removable`` read-only: contains an integer value indicating
|
||||
whether the memory block is removable or not
|
||||
removable. A value of 1 indicates that the memory
|
||||
block is removable and a value of 0 indicates that
|
||||
it is not removable. A memory block is removable only if
|
||||
every section in the block is removable.
|
||||
'valid_zones' : read-only: designed to show which zones this memory block
|
||||
``valid_zones`` read-only: designed to show which zones this memory block
|
||||
can be onlined to.
|
||||
The first column shows it's default zone.
|
||||
|
||||
The first column shows its default zone.
|
||||
|
||||
"memory6/valid_zones: Normal Movable" shows this memoryblock
|
||||
can be onlined to ZONE_NORMAL by default and to ZONE_MOVABLE
|
||||
by online_movable.
|
||||
|
||||
"memory7/valid_zones: Movable Normal" shows this memoryblock
|
||||
can be onlined to ZONE_MOVABLE by default and to ZONE_NORMAL
|
||||
by online_kernel.
|
||||
=================== ============================================================
|
||||
|
||||
.. note::
|
||||
|
||||
NOTE:
|
||||
These directories/files appear after physical memory hotplug phase.
|
||||
|
||||
If CONFIG_NUMA is enabled the memoryXXX/ directories can also be accessed
|
||||
@ -193,13 +215,14 @@ For example:
|
||||
A backlink will also be created:
|
||||
/sys/devices/system/memory/memory9/node0 -> ../../node/node0
|
||||
|
||||
.. _memory_hotplug_physical_mem:
|
||||
|
||||
--------------------------------
|
||||
4. Physical memory hot-add phase
|
||||
--------------------------------
|
||||
Physical memory hot-add phase
|
||||
=============================
|
||||
|
||||
Hardware(Firmware) Support
|
||||
--------------------------
|
||||
|
||||
4.1 Hardware(Firmware) Support
|
||||
------------
|
||||
On x86_64/ia64 platform, memory hotplug by ACPI is supported.
|
||||
|
||||
In general, the firmware (ACPI) which supports memory hotplug defines
|
||||
@ -209,7 +232,8 @@ script. This will be done automatically.
|
||||
|
||||
But scripts for memory hotplug are not contained in generic udev package(now).
|
||||
You may have to write it by yourself or online/offline memory by hand.
|
||||
Please see "How to online memory", "How to offline memory" in this text.
|
||||
Please see :ref:`memory_hotplug_how_to_online_memory` and
|
||||
:ref:`memory_hotplug_how_to_offline_memory`.
|
||||
|
||||
If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004",
|
||||
"PNP0A05", or "PNP0A06", notification is asserted to it, and ACPI handler
|
||||
@ -217,8 +241,9 @@ calls hotplug code for all of objects which are defined in it.
|
||||
If memory device is found, memory hotplug code will be called.
|
||||
|
||||
|
||||
4.2 Notify memory hot-add event by hand
|
||||
------------
|
||||
Notify memory hot-add event by hand
|
||||
-----------------------------------
|
||||
|
||||
On some architectures, the firmware may not notify the kernel of a memory
|
||||
hotplug event. Therefore, the memory "probe" interface is supported to
|
||||
explicitly notify the kernel. This interface depends on
|
||||
@ -229,45 +254,48 @@ notification.
|
||||
Probe interface is located at
|
||||
/sys/devices/system/memory/probe
|
||||
|
||||
You can tell the physical address of new memory to the kernel by
|
||||
You can tell the physical address of new memory to the kernel by::
|
||||
|
||||
% echo start_address_of_new_memory > /sys/devices/system/memory/probe
|
||||
% echo start_address_of_new_memory > /sys/devices/system/memory/probe
|
||||
|
||||
Then, [start_address_of_new_memory, start_address_of_new_memory +
|
||||
memory_block_size] memory range is hot-added. In this case, hotplug script is
|
||||
not called (in current implementation). You'll have to online memory by
|
||||
yourself. Please see "How to online memory" in this text.
|
||||
yourself. Please see :ref:`memory_hotplug_how_to_online_memory`.
|
||||
|
||||
|
||||
------------------------------
|
||||
5. Logical Memory hot-add phase
|
||||
------------------------------
|
||||
Logical Memory hot-add phase
|
||||
============================
|
||||
|
||||
5.1. State of memory
|
||||
------------
|
||||
To see (online/offline) state of a memory block, read 'state' file.
|
||||
State of memory
|
||||
---------------
|
||||
|
||||
% cat /sys/device/system/memory/memoryXXX/state
|
||||
To see (online/offline) state of a memory block, read 'state' file::
|
||||
|
||||
% cat /sys/devices/system/memory/memoryXXX/state
|
||||
|
||||
|
||||
If the memory block is online, you'll read "online".
|
||||
If the memory block is offline, you'll read "offline".
|
||||
- If the memory block is online, you'll read "online".
|
||||
- If the memory block is offline, you'll read "offline".
|
||||
|
||||
|
||||
5.2. How to online memory
|
||||
------------
|
||||
.. _memory_hotplug_how_to_online_memory:
|
||||
|
||||
How to online memory
|
||||
--------------------
|
||||
|
||||
When the memory is hot-added, the kernel decides whether or not to "online"
|
||||
it according to the policy which can be read from "auto_online_blocks" file:
|
||||
it according to the policy which can be read from "auto_online_blocks" file::
|
||||
|
||||
% cat /sys/devices/system/memory/auto_online_blocks
|
||||
% cat /sys/devices/system/memory/auto_online_blocks
|
||||
|
||||
The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
|
||||
option. If it is disabled the default is "offline" which means the newly added
|
||||
memory is not in a ready-to-use state and you have to "online" the newly added
|
||||
memory blocks manually. Automatic onlining can be requested by writing "online"
|
||||
to "auto_online_blocks" file:
|
||||
to "auto_online_blocks" file::
|
||||
|
||||
% echo online > /sys/devices/system/memory/auto_online_blocks
|
||||
% echo online > /sys/devices/system/memory/auto_online_blocks
|
||||
|
||||
This sets a global policy and impacts all memory blocks that will subsequently
|
||||
be hotplugged. Currently offline blocks keep their state. It is possible, under
|
||||
@ -277,35 +305,43 @@ online. User space tools can check their "state" files
|
||||
|
||||
If the automatic onlining wasn't requested, failed, or some memory block was
|
||||
offlined it is possible to change the individual block's state by writing to the
|
||||
"state" file:
|
||||
"state" file::
|
||||
|
||||
% echo online > /sys/devices/system/memory/memoryXXX/state
|
||||
% echo online > /sys/devices/system/memory/memoryXXX/state
|
||||
|
||||
This onlining will not change the ZONE type of the target memory block,
|
||||
If the memory block is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
|
||||
If the memory block doesn't belong to any zone an appropriate kernel zone
|
||||
(usually ZONE_NORMAL) will be used unless movable_node kernel command line
|
||||
option is specified when ZONE_MOVABLE will be used.
|
||||
|
||||
% echo online_movable > /sys/devices/system/memory/memoryXXX/state
|
||||
(NOTE: current limit: this memory block must be adjacent to ZONE_MOVABLE)
|
||||
You can explicitly request to associate it with ZONE_MOVABLE by::
|
||||
|
||||
And if the memory block is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
|
||||
% echo online_movable > /sys/devices/system/memory/memoryXXX/state
|
||||
|
||||
% echo online_kernel > /sys/devices/system/memory/memoryXXX/state
|
||||
(NOTE: current limit: this memory block must be adjacent to ZONE_NORMAL)
|
||||
.. note:: current limit: this memory block must be adjacent to ZONE_MOVABLE
|
||||
|
||||
Or you can explicitly request a kernel zone (usually ZONE_NORMAL) by::
|
||||
|
||||
% echo online_kernel > /sys/devices/system/memory/memoryXXX/state
|
||||
|
||||
.. note:: current limit: this memory block must be adjacent to ZONE_NORMAL
|
||||
|
||||
An explicit zone onlining can fail (e.g. when the range is already within
|
||||
an existing and incompatible zone).
|
||||
|
||||
After this, memory block XXX's state will be 'online' and the amount of
|
||||
available memory will be increased.
|
||||
|
||||
Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA).
|
||||
This may be changed in future.
|
||||
|
||||
|
||||
|
||||
------------------------
|
||||
6. Logical memory remove
|
||||
------------------------
|
||||
Logical memory remove
|
||||
=====================
|
||||
|
||||
Memory offline and ZONE_MOVABLE
|
||||
-------------------------------
|
||||
|
||||
6.1 Memory offline and ZONE_MOVABLE
|
||||
------------
|
||||
Memory offlining is more complicated than memory online. Because memory offline
|
||||
has to make the whole memory block be unused, memory offline can fail if
|
||||
the memory block includes memory which cannot be freed.
|
||||
@ -330,24 +366,27 @@ Assume the system has "TOTAL" amount of memory at boot time, this boot option
|
||||
creates ZONE_MOVABLE as following.
|
||||
|
||||
1) When kernelcore=YYYY boot option is used,
|
||||
Size of memory not for movable pages (not for offline) is YYYY.
|
||||
Size of memory for movable pages (for offline) is TOTAL-YYYY.
|
||||
Size of memory not for movable pages (not for offline) is YYYY.
|
||||
Size of memory for movable pages (for offline) is TOTAL-YYYY.
|
||||
|
||||
2) When movablecore=ZZZZ boot option is used,
|
||||
Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ.
|
||||
Size of memory for movable pages (for offline) is ZZZZ.
|
||||
Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ.
|
||||
Size of memory for movable pages (for offline) is ZZZZ.
|
||||
|
||||
.. note::
|
||||
|
||||
Note: Unfortunately, there is no information to show which memory block belongs
|
||||
to ZONE_MOVABLE. This is TBD.
|
||||
Unfortunately, there is no information to show which memory block belongs
|
||||
to ZONE_MOVABLE. This is TBD.
|
||||
|
||||
.. _memory_hotplug_how_to_offline_memory:
|
||||
|
||||
How to offline memory
|
||||
---------------------
|
||||
|
||||
6.2. How to offline memory
|
||||
------------
|
||||
You can offline a memory block by using the same sysfs interface that was used
|
||||
in memory onlining.
|
||||
in memory onlining::
|
||||
|
||||
% echo offline > /sys/devices/system/memory/memoryXXX/state
|
||||
% echo offline > /sys/devices/system/memory/memoryXXX/state
|
||||
|
||||
If offline succeeds, the state of the memory block is changed to be "offline".
|
||||
If it fails, some error code (like -EBUSY) will be returned by the kernel.
|
||||
@ -361,22 +400,22 @@ able to offline it (or not). (For example, a page is referred to by some kernel
|
||||
internal call and released soon.)
|
||||
|
||||
Consideration:
|
||||
Memory hotplug's design direction is to make the possibility of memory offlining
|
||||
higher and to guarantee unplugging memory under any situation. But it needs
|
||||
more work. Returning -EBUSY under some situation may be good because the user
|
||||
can decide to retry more or not by himself. Currently, memory offlining code
|
||||
does some amount of retry with 120 seconds timeout.
|
||||
Memory hotplug's design direction is to make the possibility of memory
|
||||
offlining higher and to guarantee unplugging memory under any situation. But
|
||||
it needs more work. Returning -EBUSY under some situation may be good because
|
||||
the user can decide to retry more or not by himself. Currently, memory
|
||||
offlining code does some amount of retry with 120 seconds timeout.
|
||||
|
||||
Physical memory remove
|
||||
======================
|
||||
|
||||
-------------------------
|
||||
7. Physical memory remove
|
||||
-------------------------
|
||||
Need more implementation yet....
|
||||
- Notification completion of remove works by OS to firmware.
|
||||
- Guard from remove if not yet.
|
||||
|
||||
--------------------------------
|
||||
8. Memory hotplug event notifier
|
||||
--------------------------------
|
||||
Memory hotplug event notifier
|
||||
=============================
|
||||
|
||||
Hotplugging events are sent to a notification queue.
|
||||
|
||||
There are six types of notification defined in include/linux/memory.h:
|
||||
@ -406,14 +445,14 @@ MEM_CANCEL_OFFLINE
|
||||
MEM_OFFLINE
|
||||
Generated after offlining memory is complete.
|
||||
|
||||
A callback routine can be registered by calling
|
||||
A callback routine can be registered by calling::
|
||||
|
||||
hotplug_memory_notifier(callback_func, priority)
|
||||
|
||||
Callback functions with higher values of priority are called before callback
|
||||
functions with lower values.
|
||||
|
||||
A callback function must have the following prototype:
|
||||
A callback function must have the following prototype::
|
||||
|
||||
int callback_func(
|
||||
struct notifier_block *self, unsigned long action, void *arg);
|
||||
@ -421,27 +460,28 @@ A callback function must have the following prototype:
|
||||
The first argument of the callback function (self) is a pointer to the block
|
||||
of the notifier chain that points to the callback function itself.
|
||||
The second argument (action) is one of the event types described above.
|
||||
The third argument (arg) passes a pointer to struct memory_notify.
|
||||
The third argument (arg) passes a pointer to struct memory_notify::
|
||||
|
||||
struct memory_notify {
|
||||
unsigned long start_pfn;
|
||||
unsigned long nr_pages;
|
||||
int status_change_nid_normal;
|
||||
int status_change_nid_high;
|
||||
int status_change_nid;
|
||||
}
|
||||
struct memory_notify {
|
||||
unsigned long start_pfn;
|
||||
unsigned long nr_pages;
|
||||
int status_change_nid_normal;
|
||||
int status_change_nid_high;
|
||||
int status_change_nid;
|
||||
}
|
||||
|
||||
start_pfn is start_pfn of online/offline memory.
|
||||
nr_pages is # of pages of online/offline memory.
|
||||
status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
|
||||
is (will be) set/clear, if this is -1, then nodemask status is not changed.
|
||||
status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask
|
||||
is (will be) set/clear, if this is -1, then nodemask status is not changed.
|
||||
status_change_nid is set node id when N_MEMORY of nodemask is (will be)
|
||||
set/clear. It means a new(memoryless) node gets new memory by online and a
|
||||
node loses all memory. If this is -1, then nodemask status is not changed.
|
||||
If status_change_nid* >= 0, callback should create/discard structures for the
|
||||
node if necessary.
|
||||
- start_pfn is start_pfn of online/offline memory.
|
||||
- nr_pages is # of pages of online/offline memory.
|
||||
- status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
|
||||
is (will be) set/clear, if this is -1, then nodemask status is not changed.
|
||||
- status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask
|
||||
is (will be) set/clear, if this is -1, then nodemask status is not changed.
|
||||
- status_change_nid is set node id when N_MEMORY of nodemask is (will be)
|
||||
set/clear. It means a new(memoryless) node gets new memory by online and a
|
||||
node loses all memory. If this is -1, then nodemask status is not changed.
|
||||
|
||||
If status_change_nid* >= 0, callback should create/discard structures for the
|
||||
node if necessary.
|
||||
|
||||
The callback routine shall return one of the values
|
||||
NOTIFY_DONE, NOTIFY_OK, NOTIFY_BAD, NOTIFY_STOP
|
||||
@ -455,9 +495,9 @@ further processing of the notification queue.
|
||||
|
||||
NOTIFY_STOP stops further processing of the notification queue.
|
||||
|
||||
--------------
|
||||
9. Future Work
|
||||
--------------
|
||||
Future Work
|
||||
===========
|
||||
|
||||
- allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
|
||||
sysctl or new control file.
|
||||
- showing memory block and physical device relationship.
|
||||
@ -465,4 +505,3 @@ NOTIFY_STOP stops further processing of the notification queue.
|
||||
- support HugeTLB page migration and offlining.
|
||||
- memmap removing at memory offline.
|
||||
- physical remove memory.
|
||||
|
||||
|
@ -1,163 +1,175 @@
|
||||
MEN Chameleon Bus
|
||||
=================
|
||||
|
||||
Table of Contents
|
||||
=================
|
||||
1 Introduction
|
||||
1.1 Scope of this Document
|
||||
1.2 Limitations of the current implementation
|
||||
2 Architecture
|
||||
2.1 MEN Chameleon Bus
|
||||
2.2 Carrier Devices
|
||||
2.3 Parser
|
||||
3 Resource handling
|
||||
3.1 Memory Resources
|
||||
3.2 IRQs
|
||||
4 Writing an MCB driver
|
||||
4.1 The driver structure
|
||||
4.2 Probing and attaching
|
||||
4.3 Initializing the driver
|
||||
MEN Chameleon Bus
|
||||
=================
|
||||
|
||||
.. Table of Contents
|
||||
=================
|
||||
1 Introduction
|
||||
1.1 Scope of this Document
|
||||
1.2 Limitations of the current implementation
|
||||
2 Architecture
|
||||
2.1 MEN Chameleon Bus
|
||||
2.2 Carrier Devices
|
||||
2.3 Parser
|
||||
3 Resource handling
|
||||
3.1 Memory Resources
|
||||
3.2 IRQs
|
||||
4 Writing an MCB driver
|
||||
4.1 The driver structure
|
||||
4.2 Probing and attaching
|
||||
4.3 Initializing the driver
|
||||
|
||||
|
||||
1 Introduction
|
||||
===============
|
||||
This document describes the architecture and implementation of the MEN
|
||||
Chameleon Bus (called MCB throughout this document).
|
||||
Introduction
|
||||
============
|
||||
|
||||
1.1 Scope of this Document
|
||||
---------------------------
|
||||
This document is intended to be a short overview of the current
|
||||
implementation and does by no means describe the complete possibilities of MCB
|
||||
based devices.
|
||||
This document describes the architecture and implementation of the MEN
|
||||
Chameleon Bus (called MCB throughout this document).
|
||||
|
||||
1.2 Limitations of the current implementation
|
||||
----------------------------------------------
|
||||
The current implementation is limited to PCI and PCIe based carrier devices
|
||||
that only use a single memory resource and share the PCI legacy IRQ. Not
|
||||
implemented are:
|
||||
- Multi-resource MCB devices like the VME Controller or M-Module carrier.
|
||||
- MCB devices that need another MCB device, like SRAM for a DMA Controller's
|
||||
buffer descriptors or a video controller's video memory.
|
||||
- A per-carrier IRQ domain for carrier devices that have one (or more) IRQs
|
||||
per MCB device like PCIe based carriers with MSI or MSI-X support.
|
||||
|
||||
2 Architecture
|
||||
===============
|
||||
MCB is divided into 3 functional blocks:
|
||||
- The MEN Chameleon Bus itself,
|
||||
- drivers for MCB Carrier Devices and
|
||||
- the parser for the Chameleon table.
|
||||
|
||||
2.1 MEN Chameleon Bus
|
||||
Scope of this Document
|
||||
----------------------
|
||||
The MEN Chameleon Bus is an artificial bus system that attaches to a so
|
||||
called Chameleon FPGA device found on some hardware produced by MEN Mikro
|
||||
Elektronik GmbH. These devices are multi-function devices implemented in a
|
||||
single FPGA and usually attached via some sort of PCI or PCIe link. Each
|
||||
FPGA contains a header section describing the content of the FPGA. The
|
||||
header lists the device id, PCI BAR, offset from the beginning of the PCI
|
||||
BAR, size in the FPGA, interrupt number and some other properties currently
|
||||
not handled by the MCB implementation.
|
||||
|
||||
2.2 Carrier Devices
|
||||
This document is intended to be a short overview of the current
|
||||
implementation and does by no means describe the complete possibilities of MCB
|
||||
based devices.
|
||||
|
||||
Limitations of the current implementation
|
||||
-----------------------------------------
|
||||
|
||||
The current implementation is limited to PCI and PCIe based carrier devices
|
||||
that only use a single memory resource and share the PCI legacy IRQ. Not
|
||||
implemented are:
|
||||
|
||||
- Multi-resource MCB devices like the VME Controller or M-Module carrier.
|
||||
- MCB devices that need another MCB device, like SRAM for a DMA Controller's
|
||||
buffer descriptors or a video controller's video memory.
|
||||
- A per-carrier IRQ domain for carrier devices that have one (or more) IRQs
|
||||
per MCB device like PCIe based carriers with MSI or MSI-X support.
|
||||
|
||||
Architecture
|
||||
============
|
||||
|
||||
MCB is divided into 3 functional blocks:
|
||||
|
||||
- The MEN Chameleon Bus itself,
|
||||
- drivers for MCB Carrier Devices and
|
||||
- the parser for the Chameleon table.
|
||||
|
||||
MEN Chameleon Bus
|
||||
-----------------
|
||||
|
||||
The MEN Chameleon Bus is an artificial bus system that attaches to a so
|
||||
called Chameleon FPGA device found on some hardware produced by MEN Mikro
|
||||
Elektronik GmbH. These devices are multi-function devices implemented in a
|
||||
single FPGA and usually attached via some sort of PCI or PCIe link. Each
|
||||
FPGA contains a header section describing the content of the FPGA. The
|
||||
header lists the device id, PCI BAR, offset from the beginning of the PCI
|
||||
BAR, size in the FPGA, interrupt number and some other properties currently
|
||||
not handled by the MCB implementation.
|
||||
|
||||
Carrier Devices
|
||||
---------------
|
||||
|
||||
A carrier device is just an abstraction for the real world physical bus the
|
||||
Chameleon FPGA is attached to. Some IP Core drivers may need to interact with
|
||||
properties of the carrier device (like querying the IRQ number of a PCI
|
||||
device). To provide abstraction from the real hardware bus, an MCB carrier
|
||||
device provides callback methods to translate the driver's MCB function calls
|
||||
to hardware related function calls. For example a carrier device may
|
||||
implement the get_irq() method which can be translated into a hardware bus
|
||||
query for the IRQ number the device should use.
|
||||
|
||||
Parser
|
||||
------
|
||||
|
||||
The parser reads the first 512 bytes of a Chameleon device and parses the
|
||||
Chameleon table. Currently the parser only supports the Chameleon v2 variant
|
||||
of the Chameleon table but can easily be adapted to support an older or
|
||||
possible future variant. While parsing the table's entries new MCB devices
|
||||
are allocated and their resources are assigned according to the resource
|
||||
assignment in the Chameleon table. After resource assignment is finished, the
|
||||
MCB devices are registered at the MCB and thus at the driver core of the
|
||||
Linux kernel.
|
||||
|
||||
Resource handling
|
||||
=================
|
||||
|
||||
The current implementation assigns exactly one memory and one IRQ resource
|
||||
per MCB device. But this is likely going to change in the future.
|
||||
|
||||
Memory Resources
|
||||
----------------
|
||||
|
||||
Each MCB device has exactly one memory resource, which can be requested from
|
||||
the MCB bus. This memory resource is the physical address of the MCB device
|
||||
inside the carrier and is intended to be passed to ioremap() and friends. It
|
||||
is already requested from the kernel by calling request_mem_region().
|
||||
|
||||
IRQs
|
||||
----
|
||||
|
||||
Each MCB device has exactly one IRQ resource, which can be requested from the
|
||||
MCB bus. If a carrier device driver implements the ->get_irq() callback
|
||||
method, the IRQ number assigned by the carrier device will be returned,
|
||||
otherwise the IRQ number inside the Chameleon table will be returned. This
|
||||
number is suitable to be passed to request_irq().
|
||||
|
||||
Writing an MCB driver
|
||||
=====================
|
||||
|
||||
The driver structure
|
||||
--------------------
|
||||
A carrier device is just an abstraction for the real world physical bus the
|
||||
Chameleon FPGA is attached to. Some IP Core drivers may need to interact with
|
||||
properties of the carrier device (like querying the IRQ number of a PCI
|
||||
device). To provide abstraction from the real hardware bus, an MCB carrier
|
||||
device provides callback methods to translate the driver's MCB function calls
|
||||
to hardware related function calls. For example a carrier device may
|
||||
implement the get_irq() method which can be translated into a hardware bus
|
||||
query for the IRQ number the device should use.
|
||||
|
||||
2.3 Parser
|
||||
-----------
|
||||
The parser reads the first 512 bytes of a Chameleon device and parses the
|
||||
Chameleon table. Currently the parser only supports the Chameleon v2 variant
|
||||
of the Chameleon table but can easily be adapted to support an older or
|
||||
possible future variant. While parsing the table's entries new MCB devices
|
||||
are allocated and their resources are assigned according to the resource
|
||||
assignment in the Chameleon table. After resource assignment is finished, the
|
||||
MCB devices are registered at the MCB and thus at the driver core of the
|
||||
Linux kernel.
|
||||
Each MCB driver has a structure to identify the device driver as well as
|
||||
device ids which identify the IP Core inside the FPGA. The driver structure
|
||||
also contains callback methods which get executed on driver probe and
|
||||
removal from the system::
|
||||
|
||||
3 Resource handling
|
||||
====================
|
||||
The current implementation assigns exactly one memory and one IRQ resource
|
||||
per MCB device. But this is likely going to change in the future.
|
||||
static const struct mcb_device_id foo_ids[] = {
|
||||
{ .device = 0x123 },
|
||||
{ }
|
||||
};
|
||||
MODULE_DEVICE_TABLE(mcb, foo_ids);
|
||||
|
||||
3.1 Memory Resources
|
||||
static struct mcb_driver foo_driver = {
|
||||
.driver = {
|
||||
.name = "foo-bar",
|
||||
.owner = THIS_MODULE,
|
||||
},
|
||||
.probe = foo_probe,
|
||||
.remove = foo_remove,
|
||||
.id_table = foo_ids,
|
||||
};
|
||||
|
||||
Probing and attaching
|
||||
---------------------
|
||||
Each MCB device has exactly one memory resource, which can be requested from
|
||||
the MCB bus. This memory resource is the physical address of the MCB device
|
||||
inside the carrier and is intended to be passed to ioremap() and friends. It
|
||||
is already requested from the kernel by calling request_mem_region().
|
||||
|
||||
3.2 IRQs
|
||||
---------
|
||||
Each MCB device has exactly one IRQ resource, which can be requested from the
|
||||
MCB bus. If a carrier device driver implements the ->get_irq() callback
|
||||
method, the IRQ number assigned by the carrier device will be returned,
|
||||
otherwise the IRQ number inside the Chameleon table will be returned. This
|
||||
number is suitable to be passed to request_irq().
|
||||
When a driver is loaded and the MCB devices it services are found, the MCB
|
||||
core will call the driver's probe callback method. When the driver is removed
|
||||
from the system, the MCB core will call the driver's remove callback method::
|
||||
|
||||
4 Writing an MCB driver
|
||||
=======================
|
||||
static int foo_probe(struct mcb_device *mdev, const struct mcb_device_id *id);
|
||||
static void foo_remove(struct mcb_device *mdev);
|
||||
|
||||
4.1 The driver structure
|
||||
-------------------------
|
||||
Each MCB driver has a structure to identify the device driver as well as
|
||||
device ids which identify the IP Core inside the FPGA. The driver structure
|
||||
also contains callback methods which get executed on driver probe and
|
||||
removal from the system.
|
||||
Initializing the driver
|
||||
-----------------------
|
||||
|
||||
When the kernel is booted or your foo driver module is inserted, you have to
|
||||
perform driver initialization. Usually it is enough to register your driver
|
||||
module at the MCB core::
|
||||
|
||||
static const struct mcb_device_id foo_ids[] = {
|
||||
{ .device = 0x123 },
|
||||
{ }
|
||||
};
|
||||
MODULE_DEVICE_TABLE(mcb, foo_ids);
|
||||
static int __init foo_init(void)
|
||||
{
|
||||
return mcb_register_driver(&foo_driver);
|
||||
}
|
||||
module_init(foo_init);
|
||||
|
||||
static struct mcb_driver foo_driver = {
|
||||
.driver = {
|
||||
.name = "foo-bar",
|
||||
.owner = THIS_MODULE,
|
||||
},
|
||||
.probe = foo_probe,
|
||||
.remove = foo_remove,
|
||||
.id_table = foo_ids,
|
||||
};
|
||||
static void __exit foo_exit(void)
|
||||
{
|
||||
mcb_unregister_driver(&foo_driver);
|
||||
}
|
||||
module_exit(foo_exit);
|
||||
|
||||
4.2 Probing and attaching
|
||||
--------------------------
|
||||
When a driver is loaded and the MCB devices it services are found, the MCB
|
||||
core will call the driver's probe callback method. When the driver is removed
|
||||
from the system, the MCB core will call the driver's remove callback method.
|
||||
The module_mcb_driver() macro can be used to reduce the above code::
|
||||
|
||||
|
||||
static int foo_probe(struct mcb_device *mdev, const struct mcb_device_id *id);
|
||||
static void foo_remove(struct mcb_device *mdev);
|
||||
|
||||
4.3 Initializing the driver
|
||||
----------------------------
|
||||
When the kernel is booted or your foo driver module is inserted, you have to
|
||||
perform driver initialization. Usually it is enough to register your driver
|
||||
module at the MCB core.
|
||||
|
||||
|
||||
static int __init foo_init(void)
|
||||
{
|
||||
return mcb_register_driver(&foo_driver);
|
||||
}
|
||||
module_init(foo_init);
|
||||
|
||||
static void __exit foo_exit(void)
|
||||
{
|
||||
mcb_unregister_driver(&foo_driver);
|
||||
}
|
||||
module_exit(foo_exit);
|
||||
|
||||
The module_mcb_driver() macro can be used to reduce the above code.
|
||||
|
||||
|
||||
module_mcb_driver(foo_driver);
|
||||
module_mcb_driver(foo_driver);
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user