mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-15 18:04:36 +00:00
Merge branch 'x86/asm' into x86/atomic
Merge reason: Conflict between LOCK_PREFIX_HERE and relative alternatives pointers Resolved Conflicts: arch/x86/include/asm/alternative.h arch/x86/kernel/alternative.c Signed-off-by: H. Peter Anvin <hpa@zytor.com>
This commit is contained in:
commit
d9c5841e22
18
.gitignore
vendored
18
.gitignore
vendored
@ -34,12 +34,18 @@ modules.builtin
|
||||
#
|
||||
# Top-level generic files
|
||||
#
|
||||
tags
|
||||
TAGS
|
||||
vmlinux
|
||||
System.map
|
||||
Module.markers
|
||||
Module.symvers
|
||||
/tags
|
||||
/TAGS
|
||||
/linux
|
||||
/vmlinux
|
||||
/vmlinuz
|
||||
/System.map
|
||||
/Module.markers
|
||||
/Module.symvers
|
||||
|
||||
#
|
||||
# git files that we don't want to ignore even if they are dot-files
|
||||
#
|
||||
!.gitignore
|
||||
!.mailmap
|
||||
|
||||
|
7
Documentation/ABI/stable/sysfs-devices-node
Normal file
7
Documentation/ABI/stable/sysfs-devices-node
Normal file
@ -0,0 +1,7 @@
|
||||
What: /sys/devices/system/node/nodeX
|
||||
Date: October 2002
|
||||
Contact: Linux Memory Management list <linux-mm@kvack.org>
|
||||
Description:
|
||||
When CONFIG_NUMA is enabled, this is a directory containing
|
||||
information on node X such as what CPUs are local to the
|
||||
node.
|
@ -20,7 +20,7 @@ Description:
|
||||
lsm: [[subj_user=] [subj_role=] [subj_type=]
|
||||
[obj_user=] [obj_role=] [obj_type=]]
|
||||
|
||||
base: func:= [BPRM_CHECK][FILE_MMAP][INODE_PERMISSION]
|
||||
base: func:= [BPRM_CHECK][FILE_MMAP][FILE_CHECK]
|
||||
mask:= [MAY_READ] [MAY_WRITE] [MAY_APPEND] [MAY_EXEC]
|
||||
fsmagic:= hex value
|
||||
uid:= decimal value
|
||||
@ -40,11 +40,11 @@ Description:
|
||||
|
||||
measure func=BPRM_CHECK
|
||||
measure func=FILE_MMAP mask=MAY_EXEC
|
||||
measure func=INODE_PERM mask=MAY_READ uid=0
|
||||
measure func=FILE_CHECK mask=MAY_READ uid=0
|
||||
|
||||
The default policy measures all executables in bprm_check,
|
||||
all files mmapped executable in file_mmap, and all files
|
||||
open for read by root in inode_permission.
|
||||
open for read by root in do_filp_open.
|
||||
|
||||
Examples of LSM specific definitions:
|
||||
|
||||
@ -54,8 +54,8 @@ Description:
|
||||
|
||||
dont_measure obj_type=var_log_t
|
||||
dont_measure obj_type=auditd_log_t
|
||||
measure subj_user=system_u func=INODE_PERM mask=MAY_READ
|
||||
measure subj_role=system_r func=INODE_PERM mask=MAY_READ
|
||||
measure subj_user=system_u func=FILE_CHECK mask=MAY_READ
|
||||
measure subj_role=system_r func=FILE_CHECK mask=MAY_READ
|
||||
|
||||
Smack:
|
||||
measure subj_user=_ func=INODE_PERM mask=MAY_READ
|
||||
measure subj_user=_ func=FILE_CHECK mask=MAY_READ
|
||||
|
@ -128,3 +128,17 @@ Description:
|
||||
preferred request size for workloads where sustained
|
||||
throughput is desired. If no optimal I/O size is
|
||||
reported this file contains 0.
|
||||
|
||||
What: /sys/block/<disk>/queue/nomerges
|
||||
Date: January 2010
|
||||
Contact:
|
||||
Description:
|
||||
Standard I/O elevator operations include attempts to
|
||||
merge contiguous I/Os. For known random I/O loads these
|
||||
attempts will always fail and result in extra cycles
|
||||
being spent in the kernel. This allows one to turn off
|
||||
this behavior in one of two ways: When set to 1, complex
|
||||
merge checks are disabled, but the simple one-shot merges
|
||||
with the previous I/O request are enabled. When set to 2,
|
||||
all merge tries are disabled. The default value is 0 -
|
||||
which enables all types of merge tries.
|
||||
|
@ -159,3 +159,14 @@ Description:
|
||||
device. This is useful to ensure auto probing won't
|
||||
match the driver to the device. For example:
|
||||
# echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id
|
||||
|
||||
What: /sys/bus/usb/device/.../avoid_reset_quirk
|
||||
Date: December 2009
|
||||
Contact: Oliver Neukum <oliver@neukum.org>
|
||||
Description:
|
||||
Writing 1 to this file tells the kernel that this
|
||||
device will morph into another mode when it is reset.
|
||||
Drivers will not use reset for error handling for
|
||||
such devices.
|
||||
Users:
|
||||
usb_modeswitch
|
||||
|
79
Documentation/ABI/testing/sysfs-devices-power
Normal file
79
Documentation/ABI/testing/sysfs-devices-power
Normal file
@ -0,0 +1,79 @@
|
||||
What: /sys/devices/.../power/
|
||||
Date: January 2009
|
||||
Contact: Rafael J. Wysocki <rjw@sisk.pl>
|
||||
Description:
|
||||
The /sys/devices/.../power directory contains attributes
|
||||
allowing the user space to check and modify some power
|
||||
management related properties of given device.
|
||||
|
||||
What: /sys/devices/.../power/wakeup
|
||||
Date: January 2009
|
||||
Contact: Rafael J. Wysocki <rjw@sisk.pl>
|
||||
Description:
|
||||
The /sys/devices/.../power/wakeup attribute allows the user
|
||||
space to check if the device is enabled to wake up the system
|
||||
from sleep states, such as the memory sleep state (suspend to
|
||||
RAM) and hibernation (suspend to disk), and to enable or disable
|
||||
it to do that as desired.
|
||||
|
||||
Some devices support "wakeup" events, which are hardware signals
|
||||
used to activate the system from a sleep state. Such devices
|
||||
have one of the following two values for the sysfs power/wakeup
|
||||
file:
|
||||
|
||||
+ "enabled\n" to issue the events;
|
||||
+ "disabled\n" not to do so;
|
||||
|
||||
In that case the user space can change the setting represented
|
||||
by the contents of this file by writing either "enabled", or
|
||||
"disabled" to it.
|
||||
|
||||
For the devices that are not capable of generating system wakeup
|
||||
events this file contains "\n". In that case the user space
|
||||
cannot modify the contents of this file and the device cannot be
|
||||
enabled to wake up the system.
|
||||
|
||||
What: /sys/devices/.../power/control
|
||||
Date: January 2009
|
||||
Contact: Rafael J. Wysocki <rjw@sisk.pl>
|
||||
Description:
|
||||
The /sys/devices/.../power/control attribute allows the user
|
||||
space to control the run-time power management of the device.
|
||||
|
||||
All devices have one of the following two values for the
|
||||
power/control file:
|
||||
|
||||
+ "auto\n" to allow the device to be power managed at run time;
|
||||
+ "on\n" to prevent the device from being power managed;
|
||||
|
||||
The default for all devices is "auto", which means that they may
|
||||
be subject to automatic power management, depending on their
|
||||
drivers. Changing this attribute to "on" prevents the driver
|
||||
from power managing the device at run time. Doing that while
|
||||
the device is suspended causes it to be woken up.
|
||||
|
||||
What: /sys/devices/.../power/async
|
||||
Date: January 2009
|
||||
Contact: Rafael J. Wysocki <rjw@sisk.pl>
|
||||
Description:
|
||||
The /sys/devices/.../power/async attribute allows the user space to
|
||||
enable or disable the device's suspend and resume callbacks to
|
||||
be executed asynchronously (ie. in separate threads, in parallel
|
||||
with the main suspend/resume thread) during system-wide power
|
||||
transitions (eg. suspend to RAM, hibernation).
|
||||
|
||||
All devices have one of the following two values for the
|
||||
power/async file:
|
||||
|
||||
+ "enabled\n" to permit the asynchronous suspend/resume;
|
||||
+ "disabled\n" to forbid it;
|
||||
|
||||
The value of this attribute may be changed by writing either
|
||||
"enabled", or "disabled" to it.
|
||||
|
||||
It generally is unsafe to permit the asynchronous suspend/resume
|
||||
of a device unless it is certain that all of the PM dependencies
|
||||
of the device are known to the PM core. However, for some
|
||||
devices this attribute is set to "enabled" by bus type code or
|
||||
device drivers and in that case it should be safe to leave the
|
||||
default value.
|
@ -1,4 +1,4 @@
|
||||
What: /sys/devices/platform/asus-laptop/display
|
||||
What: /sys/devices/platform/asus_laptop/display
|
||||
Date: January 2007
|
||||
KernelVersion: 2.6.20
|
||||
Contact: "Corentin Chary" <corentincj@iksaif.net>
|
||||
@ -13,7 +13,7 @@ Description:
|
||||
Ex: - 0 (0000b) means no display
|
||||
- 3 (0011b) CRT+LCD.
|
||||
|
||||
What: /sys/devices/platform/asus-laptop/gps
|
||||
What: /sys/devices/platform/asus_laptop/gps
|
||||
Date: January 2007
|
||||
KernelVersion: 2.6.20
|
||||
Contact: "Corentin Chary" <corentincj@iksaif.net>
|
||||
@ -21,7 +21,7 @@ Description:
|
||||
Control the gps device. 1 means on, 0 means off.
|
||||
Users: Lapsus
|
||||
|
||||
What: /sys/devices/platform/asus-laptop/ledd
|
||||
What: /sys/devices/platform/asus_laptop/ledd
|
||||
Date: January 2007
|
||||
KernelVersion: 2.6.20
|
||||
Contact: "Corentin Chary" <corentincj@iksaif.net>
|
||||
@ -29,11 +29,11 @@ Description:
|
||||
Some models like the W1N have a LED display that can be
|
||||
used to display various pieces of information.
|
||||
To control the LED display, use the following :
|
||||
echo 0x0T000DDD > /sys/devices/platform/asus-laptop/
|
||||
echo 0x0T000DDD > /sys/devices/platform/asus_laptop/
|
||||
where T controls the 3-letter display, and DDD the 3-digit display.
|
||||
The DDD table can be found in Documentation/laptops/asus-laptop.txt
|
||||
|
||||
What: /sys/devices/platform/asus-laptop/bluetooth
|
||||
What: /sys/devices/platform/asus_laptop/bluetooth
|
||||
Date: January 2007
|
||||
KernelVersion: 2.6.20
|
||||
Contact: "Corentin Chary" <corentincj@iksaif.net>
|
||||
@ -42,7 +42,7 @@ Description:
|
||||
This may control the led, the device or both.
|
||||
Users: Lapsus
|
||||
|
||||
What: /sys/devices/platform/asus-laptop/wlan
|
||||
What: /sys/devices/platform/asus_laptop/wlan
|
||||
Date: January 2007
|
||||
KernelVersion: 2.6.20
|
||||
Contact: "Corentin Chary" <corentincj@iksaif.net>
|
||||
|
@ -1,4 +1,4 @@
|
||||
What: /sys/devices/platform/eeepc-laptop/disp
|
||||
What: /sys/devices/platform/eeepc/disp
|
||||
Date: May 2008
|
||||
KernelVersion: 2.6.26
|
||||
Contact: "Corentin Chary" <corentincj@iksaif.net>
|
||||
@ -9,21 +9,21 @@ Description:
|
||||
- 3 = LCD+CRT
|
||||
If you run X11, you should use xrandr instead.
|
||||
|
||||
What: /sys/devices/platform/eeepc-laptop/camera
|
||||
What: /sys/devices/platform/eeepc/camera
|
||||
Date: May 2008
|
||||
KernelVersion: 2.6.26
|
||||
Contact: "Corentin Chary" <corentincj@iksaif.net>
|
||||
Description:
|
||||
Control the camera. 1 means on, 0 means off.
|
||||
|
||||
What: /sys/devices/platform/eeepc-laptop/cardr
|
||||
What: /sys/devices/platform/eeepc/cardr
|
||||
Date: May 2008
|
||||
KernelVersion: 2.6.26
|
||||
Contact: "Corentin Chary" <corentincj@iksaif.net>
|
||||
Description:
|
||||
Control the card reader. 1 means on, 0 means off.
|
||||
|
||||
What: /sys/devices/platform/eeepc-laptop/cpufv
|
||||
What: /sys/devices/platform/eeepc/cpufv
|
||||
Date: Jun 2009
|
||||
KernelVersion: 2.6.31
|
||||
Contact: "Corentin Chary" <corentincj@iksaif.net>
|
||||
@ -42,7 +42,7 @@ Description:
|
||||
`------------ Available modes
|
||||
For example, 0x301 means: mode 1 selected, 3 available modes.
|
||||
|
||||
What: /sys/devices/platform/eeepc-laptop/available_cpufv
|
||||
What: /sys/devices/platform/eeepc/available_cpufv
|
||||
Date: Jun 2009
|
||||
KernelVersion: 2.6.31
|
||||
Contact: "Corentin Chary" <corentincj@iksaif.net>
|
||||
|
@ -101,3 +101,16 @@ Description:
|
||||
|
||||
CAUTION: Using it will cause your machine's real-time (CMOS)
|
||||
clock to be set to a random invalid time after a resume.
|
||||
|
||||
What: /sys/power/pm_async
|
||||
Date: January 2009
|
||||
Contact: Rafael J. Wysocki <rjw@sisk.pl>
|
||||
Description:
|
||||
The /sys/power/pm_async file controls the switch allowing the
|
||||
user space to enable or disable asynchronous suspend and resume
|
||||
of devices. If enabled, this feature will cause some device
|
||||
drivers' suspend and resume callbacks to be executed in parallel
|
||||
with each other and with the main suspend thread. It is enabled
|
||||
if this file contains "1", which is the default. It may be
|
||||
disabled by writing "0" to this file, in which case all devices
|
||||
will be suspended and resumed synchronously.
|
||||
|
@ -1,12 +1,12 @@
|
||||
Dynamic DMA mapping
|
||||
===================
|
||||
Dynamic DMA mapping Guide
|
||||
=========================
|
||||
|
||||
David S. Miller <davem@redhat.com>
|
||||
Richard Henderson <rth@cygnus.com>
|
||||
Jakub Jelinek <jakub@redhat.com>
|
||||
|
||||
This document describes the DMA mapping system in terms of the pci_
|
||||
API. For a similar API that works for generic devices, see
|
||||
This is a guide to device driver writers on how to use the DMA API
|
||||
with example pseudo-code. For a concise description of the API, see
|
||||
DMA-API.txt.
|
||||
|
||||
Most of the 64bit platforms have special hardware that translates bus
|
||||
@ -26,12 +26,15 @@ mapped only for the time they are actually used and unmapped after the DMA
|
||||
transfer.
|
||||
|
||||
The following API will work of course even on platforms where no such
|
||||
hardware exists, see e.g. arch/x86/include/asm/pci.h for how it is implemented on
|
||||
top of the virt_to_bus interface.
|
||||
hardware exists.
|
||||
|
||||
Note that the DMA API works with any bus independent of the underlying
|
||||
microprocessor architecture. You should use the DMA API rather than
|
||||
the bus specific DMA API (e.g. pci_dma_*).
|
||||
|
||||
First of all, you should make sure
|
||||
|
||||
#include <linux/pci.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
|
||||
is in your driver. This file will obtain for you the definition of the
|
||||
dma_addr_t (which can hold any valid DMA address for the platform)
|
||||
@ -78,44 +81,43 @@ for you to DMA from/to.
|
||||
DMA addressing limitations
|
||||
|
||||
Does your device have any DMA addressing limitations? For example, is
|
||||
your device only capable of driving the low order 24-bits of address
|
||||
on the PCI bus for SAC DMA transfers? If so, you need to inform the
|
||||
PCI layer of this fact.
|
||||
your device only capable of driving the low order 24-bits of address?
|
||||
If so, you need to inform the kernel of this fact.
|
||||
|
||||
By default, the kernel assumes that your device can address the full
|
||||
32-bits in a SAC cycle. For a 64-bit DAC capable device, this needs
|
||||
to be increased. And for a device with limitations, as discussed in
|
||||
the previous paragraph, it needs to be decreased.
|
||||
32-bits. For a 64-bit capable device, this needs to be increased.
|
||||
And for a device with limitations, as discussed in the previous
|
||||
paragraph, it needs to be decreased.
|
||||
|
||||
pci_alloc_consistent() by default will return 32-bit DMA addresses.
|
||||
PCI-X specification requires PCI-X devices to support 64-bit
|
||||
addressing (DAC) for all transactions. And at least one platform (SGI
|
||||
SN2) requires 64-bit consistent allocations to operate correctly when
|
||||
the IO bus is in PCI-X mode. Therefore, like with pci_set_dma_mask(),
|
||||
it's good practice to call pci_set_consistent_dma_mask() to set the
|
||||
appropriate mask even if your device only supports 32-bit DMA
|
||||
(default) and especially if it's a PCI-X device.
|
||||
Special note about PCI: PCI-X specification requires PCI-X devices to
|
||||
support 64-bit addressing (DAC) for all transactions. And at least
|
||||
one platform (SGI SN2) requires 64-bit consistent allocations to
|
||||
operate correctly when the IO bus is in PCI-X mode.
|
||||
|
||||
For correct operation, you must interrogate the PCI layer in your
|
||||
device probe routine to see if the PCI controller on the machine can
|
||||
properly support the DMA addressing limitation your device has. It is
|
||||
good style to do this even if your device holds the default setting,
|
||||
For correct operation, you must interrogate the kernel in your device
|
||||
probe routine to see if the DMA controller on the machine can properly
|
||||
support the DMA addressing limitation your device has. It is good
|
||||
style to do this even if your device holds the default setting,
|
||||
because this shows that you did think about these issues wrt. your
|
||||
device.
|
||||
|
||||
The query is performed via a call to pci_set_dma_mask():
|
||||
The query is performed via a call to dma_set_mask():
|
||||
|
||||
int pci_set_dma_mask(struct pci_dev *pdev, u64 device_mask);
|
||||
int dma_set_mask(struct device *dev, u64 mask);
|
||||
|
||||
The query for consistent allocations is performed via a call to
|
||||
pci_set_consistent_dma_mask():
|
||||
dma_set_coherent_mask():
|
||||
|
||||
int pci_set_consistent_dma_mask(struct pci_dev *pdev, u64 device_mask);
|
||||
int dma_set_coherent_mask(struct device *dev, u64 mask);
|
||||
|
||||
Here, pdev is a pointer to the PCI device struct of your device, and
|
||||
device_mask is a bit mask describing which bits of a PCI address your
|
||||
device supports. It returns zero if your card can perform DMA
|
||||
properly on the machine given the address mask you provided.
|
||||
Here, dev is a pointer to the device struct of your device, and mask
|
||||
is a bit mask describing which bits of an address your device
|
||||
supports. It returns zero if your card can perform DMA properly on
|
||||
the machine given the address mask you provided. In general, the
|
||||
device struct of your device is embedded in the bus specific device
|
||||
struct of your device. For example, a pointer to the device struct of
|
||||
your PCI device is pdev->dev (pdev is a pointer to the PCI device
|
||||
struct of your device).
|
||||
|
||||
If it returns non-zero, your device cannot perform DMA properly on
|
||||
this platform, and attempting to do so will result in undefined
|
||||
@ -133,31 +135,30 @@ of your driver reports that performance is bad or that the device is not
|
||||
even detected, you can ask them for the kernel messages to find out
|
||||
exactly why.
|
||||
|
||||
The standard 32-bit addressing PCI device would do something like
|
||||
this:
|
||||
The standard 32-bit addressing device would do something like this:
|
||||
|
||||
if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
|
||||
if (dma_set_mask(dev, DMA_BIT_MASK(32))) {
|
||||
printk(KERN_WARNING
|
||||
"mydev: No suitable DMA available.\n");
|
||||
goto ignore_this_device;
|
||||
}
|
||||
|
||||
Another common scenario is a 64-bit capable device. The approach
|
||||
here is to try for 64-bit DAC addressing, but back down to a
|
||||
32-bit mask should that fail. The PCI platform code may fail the
|
||||
64-bit mask not because the platform is not capable of 64-bit
|
||||
addressing. Rather, it may fail in this case simply because
|
||||
32-bit SAC addressing is done more efficiently than DAC addressing.
|
||||
Sparc64 is one platform which behaves in this way.
|
||||
Another common scenario is a 64-bit capable device. The approach here
|
||||
is to try for 64-bit addressing, but back down to a 32-bit mask that
|
||||
should not fail. The kernel may fail the 64-bit mask not because the
|
||||
platform is not capable of 64-bit addressing. Rather, it may fail in
|
||||
this case simply because 32-bit addressing is done more efficiently
|
||||
than 64-bit addressing. For example, Sparc64 PCI SAC addressing is
|
||||
more efficient than DAC addressing.
|
||||
|
||||
Here is how you would handle a 64-bit capable device which can drive
|
||||
all 64-bits when accessing streaming DMA:
|
||||
|
||||
int using_dac;
|
||||
|
||||
if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
|
||||
if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
|
||||
using_dac = 1;
|
||||
} else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
|
||||
} else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
|
||||
using_dac = 0;
|
||||
} else {
|
||||
printk(KERN_WARNING
|
||||
@ -170,36 +171,36 @@ the case would look like this:
|
||||
|
||||
int using_dac, consistent_using_dac;
|
||||
|
||||
if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
|
||||
if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
|
||||
using_dac = 1;
|
||||
consistent_using_dac = 1;
|
||||
pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
|
||||
} else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
|
||||
dma_set_coherent_mask(dev, DMA_BIT_MASK(64));
|
||||
} else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
|
||||
using_dac = 0;
|
||||
consistent_using_dac = 0;
|
||||
pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
|
||||
dma_set_coherent_mask(dev, DMA_BIT_MASK(32));
|
||||
} else {
|
||||
printk(KERN_WARNING
|
||||
"mydev: No suitable DMA available.\n");
|
||||
goto ignore_this_device;
|
||||
}
|
||||
|
||||
pci_set_consistent_dma_mask() will always be able to set the same or a
|
||||
smaller mask as pci_set_dma_mask(). However for the rare case that a
|
||||
dma_set_coherent_mask() will always be able to set the same or a
|
||||
smaller mask as dma_set_mask(). However for the rare case that a
|
||||
device driver only uses consistent allocations, one would have to
|
||||
check the return value from pci_set_consistent_dma_mask().
|
||||
check the return value from dma_set_coherent_mask().
|
||||
|
||||
Finally, if your device can only drive the low 24-bits of
|
||||
address during PCI bus mastering you might do something like:
|
||||
address you might do something like:
|
||||
|
||||
if (pci_set_dma_mask(pdev, DMA_BIT_MASK(24))) {
|
||||
if (dma_set_mask(dev, DMA_BIT_MASK(24))) {
|
||||
printk(KERN_WARNING
|
||||
"mydev: 24-bit DMA addressing not available.\n");
|
||||
goto ignore_this_device;
|
||||
}
|
||||
|
||||
When pci_set_dma_mask() is successful, and returns zero, the PCI layer
|
||||
saves away this mask you have provided. The PCI layer will use this
|
||||
When dma_set_mask() is successful, and returns zero, the kernel saves
|
||||
away this mask you have provided. The kernel will use this
|
||||
information later when you make DMA mappings.
|
||||
|
||||
There is a case which we are aware of at this time, which is worth
|
||||
@ -208,7 +209,7 @@ functions (for example a sound card provides playback and record
|
||||
functions) and the various different functions have _different_
|
||||
DMA addressing limitations, you may wish to probe each mask and
|
||||
only provide the functionality which the machine can handle. It
|
||||
is important that the last call to pci_set_dma_mask() be for the
|
||||
is important that the last call to dma_set_mask() be for the
|
||||
most specific mask.
|
||||
|
||||
Here is pseudo-code showing how this might be done:
|
||||
@ -217,17 +218,17 @@ Here is pseudo-code showing how this might be done:
|
||||
#define RECORD_ADDRESS_BITS DMA_BIT_MASK(24)
|
||||
|
||||
struct my_sound_card *card;
|
||||
struct pci_dev *pdev;
|
||||
struct device *dev;
|
||||
|
||||
...
|
||||
if (!pci_set_dma_mask(pdev, PLAYBACK_ADDRESS_BITS)) {
|
||||
if (!dma_set_mask(dev, PLAYBACK_ADDRESS_BITS)) {
|
||||
card->playback_enabled = 1;
|
||||
} else {
|
||||
card->playback_enabled = 0;
|
||||
printk(KERN_WARNING "%s: Playback disabled due to DMA limitations.\n",
|
||||
card->name);
|
||||
}
|
||||
if (!pci_set_dma_mask(pdev, RECORD_ADDRESS_BITS)) {
|
||||
if (!dma_set_mask(dev, RECORD_ADDRESS_BITS)) {
|
||||
card->record_enabled = 1;
|
||||
} else {
|
||||
card->record_enabled = 0;
|
||||
@ -252,8 +253,8 @@ There are two types of DMA mappings:
|
||||
Think of "consistent" as "synchronous" or "coherent".
|
||||
|
||||
The current default is to return consistent memory in the low 32
|
||||
bits of the PCI bus space. However, for future compatibility you
|
||||
should set the consistent mask even if this default is fine for your
|
||||
bits of the bus space. However, for future compatibility you should
|
||||
set the consistent mask even if this default is fine for your
|
||||
driver.
|
||||
|
||||
Good examples of what to use consistent mappings for are:
|
||||
@ -285,9 +286,9 @@ There are two types of DMA mappings:
|
||||
found in PCI bridges (such as by reading a register's value
|
||||
after writing it).
|
||||
|
||||
- Streaming DMA mappings which are usually mapped for one DMA transfer,
|
||||
unmapped right after it (unless you use pci_dma_sync_* below) and for which
|
||||
hardware can optimize for sequential accesses.
|
||||
- Streaming DMA mappings which are usually mapped for one DMA
|
||||
transfer, unmapped right after it (unless you use dma_sync_* below)
|
||||
and for which hardware can optimize for sequential accesses.
|
||||
|
||||
Think of "streaming" as "asynchronous" or "outside the coherency
|
||||
domain".
|
||||
@ -302,8 +303,8 @@ There are two types of DMA mappings:
|
||||
optimizations the hardware allows. To this end, when using
|
||||
such mappings you must be explicit about what you want to happen.
|
||||
|
||||
Neither type of DMA mapping has alignment restrictions that come
|
||||
from PCI, although some devices may have such restrictions.
|
||||
Neither type of DMA mapping has alignment restrictions that come from
|
||||
the underlying bus, although some devices may have such restrictions.
|
||||
Also, systems with caches that aren't DMA-coherent will work better
|
||||
when the underlying buffers don't share cache lines with other data.
|
||||
|
||||
@ -315,33 +316,27 @@ you should do:
|
||||
|
||||
dma_addr_t dma_handle;
|
||||
|
||||
cpu_addr = pci_alloc_consistent(pdev, size, &dma_handle);
|
||||
cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, gfp);
|
||||
|
||||
where pdev is a struct pci_dev *. This may be called in interrupt context.
|
||||
You should use dma_alloc_coherent (see DMA-API.txt) for buses
|
||||
where devices don't have struct pci_dev (like ISA, EISA).
|
||||
|
||||
This argument is needed because the DMA translations may be bus
|
||||
specific (and often is private to the bus which the device is attached
|
||||
to).
|
||||
where dev is a struct device *. This may be called in interrupt
|
||||
context with the GFP_ATOMIC flag.
|
||||
|
||||
Size is the length of the region you want to allocate, in bytes.
|
||||
|
||||
This routine will allocate RAM for that region, so it acts similarly to
|
||||
__get_free_pages (but takes size instead of a page order). If your
|
||||
driver needs regions sized smaller than a page, you may prefer using
|
||||
the pci_pool interface, described below.
|
||||
the dma_pool interface, described below.
|
||||
|
||||
The consistent DMA mapping interfaces, for non-NULL pdev, will by
|
||||
default return a DMA address which is SAC (Single Address Cycle)
|
||||
addressable. Even if the device indicates (via PCI dma mask) that it
|
||||
may address the upper 32-bits and thus perform DAC cycles, consistent
|
||||
allocation will only return > 32-bit PCI addresses for DMA if the
|
||||
consistent dma mask has been explicitly changed via
|
||||
pci_set_consistent_dma_mask(). This is true of the pci_pool interface
|
||||
as well.
|
||||
The consistent DMA mapping interfaces, for non-NULL dev, will by
|
||||
default return a DMA address which is 32-bit addressable. Even if the
|
||||
device indicates (via DMA mask) that it may address the upper 32-bits,
|
||||
consistent allocation will only return > 32-bit addresses for DMA if
|
||||
the consistent DMA mask has been explicitly changed via
|
||||
dma_set_coherent_mask(). This is true of the dma_pool interface as
|
||||
well.
|
||||
|
||||
pci_alloc_consistent returns two values: the virtual address which you
|
||||
dma_alloc_coherent returns two values: the virtual address which you
|
||||
can use to access it from the CPU and dma_handle which you pass to the
|
||||
card.
|
||||
|
||||
@ -354,54 +349,54 @@ buffer you receive will not cross a 64K boundary.
|
||||
|
||||
To unmap and free such a DMA region, you call:
|
||||
|
||||
pci_free_consistent(pdev, size, cpu_addr, dma_handle);
|
||||
dma_free_coherent(dev, size, cpu_addr, dma_handle);
|
||||
|
||||
where pdev, size are the same as in the above call and cpu_addr and
|
||||
dma_handle are the values pci_alloc_consistent returned to you.
|
||||
where dev, size are the same as in the above call and cpu_addr and
|
||||
dma_handle are the values dma_alloc_coherent returned to you.
|
||||
This function may not be called in interrupt context.
|
||||
|
||||
If your driver needs lots of smaller memory regions, you can write
|
||||
custom code to subdivide pages returned by pci_alloc_consistent,
|
||||
or you can use the pci_pool API to do that. A pci_pool is like
|
||||
a kmem_cache, but it uses pci_alloc_consistent not __get_free_pages.
|
||||
custom code to subdivide pages returned by dma_alloc_coherent,
|
||||
or you can use the dma_pool API to do that. A dma_pool is like
|
||||
a kmem_cache, but it uses dma_alloc_coherent not __get_free_pages.
|
||||
Also, it understands common hardware constraints for alignment,
|
||||
like queue heads needing to be aligned on N byte boundaries.
|
||||
|
||||
Create a pci_pool like this:
|
||||
Create a dma_pool like this:
|
||||
|
||||
struct pci_pool *pool;
|
||||
struct dma_pool *pool;
|
||||
|
||||
pool = pci_pool_create(name, pdev, size, align, alloc);
|
||||
pool = dma_pool_create(name, dev, size, align, alloc);
|
||||
|
||||
The "name" is for diagnostics (like a kmem_cache name); pdev and size
|
||||
The "name" is for diagnostics (like a kmem_cache name); dev and size
|
||||
are as above. The device's hardware alignment requirement for this
|
||||
type of data is "align" (which is expressed in bytes, and must be a
|
||||
power of two). If your device has no boundary crossing restrictions,
|
||||
pass 0 for alloc; passing 4096 says memory allocated from this pool
|
||||
must not cross 4KByte boundaries (but at that time it may be better to
|
||||
go for pci_alloc_consistent directly instead).
|
||||
go for dma_alloc_coherent directly instead).
|
||||
|
||||
Allocate memory from a pci pool like this:
|
||||
Allocate memory from a dma pool like this:
|
||||
|
||||
cpu_addr = pci_pool_alloc(pool, flags, &dma_handle);
|
||||
cpu_addr = dma_pool_alloc(pool, flags, &dma_handle);
|
||||
|
||||
flags are SLAB_KERNEL if blocking is permitted (not in_interrupt nor
|
||||
holding SMP locks), SLAB_ATOMIC otherwise. Like pci_alloc_consistent,
|
||||
holding SMP locks), SLAB_ATOMIC otherwise. Like dma_alloc_coherent,
|
||||
this returns two values, cpu_addr and dma_handle.
|
||||
|
||||
Free memory that was allocated from a pci_pool like this:
|
||||
Free memory that was allocated from a dma_pool like this:
|
||||
|
||||
pci_pool_free(pool, cpu_addr, dma_handle);
|
||||
dma_pool_free(pool, cpu_addr, dma_handle);
|
||||
|
||||
where pool is what you passed to pci_pool_alloc, and cpu_addr and
|
||||
dma_handle are the values pci_pool_alloc returned. This function
|
||||
where pool is what you passed to dma_pool_alloc, and cpu_addr and
|
||||
dma_handle are the values dma_pool_alloc returned. This function
|
||||
may be called in interrupt context.
|
||||
|
||||
Destroy a pci_pool by calling:
|
||||
Destroy a dma_pool by calling:
|
||||
|
||||
pci_pool_destroy(pool);
|
||||
dma_pool_destroy(pool);
|
||||
|
||||
Make sure you've called pci_pool_free for all memory allocated
|
||||
Make sure you've called dma_pool_free for all memory allocated
|
||||
from a pool before you destroy the pool. This function may not
|
||||
be called in interrupt context.
|
||||
|
||||
@ -411,15 +406,15 @@ The interfaces described in subsequent portions of this document
|
||||
take a DMA direction argument, which is an integer and takes on
|
||||
one of the following values:
|
||||
|
||||
PCI_DMA_BIDIRECTIONAL
|
||||
PCI_DMA_TODEVICE
|
||||
PCI_DMA_FROMDEVICE
|
||||
PCI_DMA_NONE
|
||||
DMA_BIDIRECTIONAL
|
||||
DMA_TO_DEVICE
|
||||
DMA_FROM_DEVICE
|
||||
DMA_NONE
|
||||
|
||||
One should provide the exact DMA direction if you know it.
|
||||
|
||||
PCI_DMA_TODEVICE means "from main memory to the PCI device"
|
||||
PCI_DMA_FROMDEVICE means "from the PCI device to main memory"
|
||||
DMA_TO_DEVICE means "from main memory to the device"
|
||||
DMA_FROM_DEVICE means "from the device to main memory"
|
||||
It is the direction in which the data moves during the DMA
|
||||
transfer.
|
||||
|
||||
@ -427,12 +422,12 @@ You are _strongly_ encouraged to specify this as precisely
|
||||
as you possibly can.
|
||||
|
||||
If you absolutely cannot know the direction of the DMA transfer,
|
||||
specify PCI_DMA_BIDIRECTIONAL. It means that the DMA can go in
|
||||
specify DMA_BIDIRECTIONAL. It means that the DMA can go in
|
||||
either direction. The platform guarantees that you may legally
|
||||
specify this, and that it will work, but this may be at the
|
||||
cost of performance for example.
|
||||
|
||||
The value PCI_DMA_NONE is to be used for debugging. One can
|
||||
The value DMA_NONE is to be used for debugging. One can
|
||||
hold this in a data structure before you come to know the
|
||||
precise direction, and this will help catch cases where your
|
||||
direction tracking logic has failed to set things up properly.
|
||||
@ -442,21 +437,21 @@ potential platform-specific optimizations of such) is for debugging.
|
||||
Some platforms actually have a write permission boolean which DMA
|
||||
mappings can be marked with, much like page protections in the user
|
||||
program address space. Such platforms can and do report errors in the
|
||||
kernel logs when the PCI controller hardware detects violation of the
|
||||
kernel logs when the DMA controller hardware detects violation of the
|
||||
permission setting.
|
||||
|
||||
Only streaming mappings specify a direction, consistent mappings
|
||||
implicitly have a direction attribute setting of
|
||||
PCI_DMA_BIDIRECTIONAL.
|
||||
DMA_BIDIRECTIONAL.
|
||||
|
||||
The SCSI subsystem tells you the direction to use in the
|
||||
'sc_data_direction' member of the SCSI command your driver is
|
||||
working on.
|
||||
|
||||
For Networking drivers, it's a rather simple affair. For transmit
|
||||
packets, map/unmap them with the PCI_DMA_TODEVICE direction
|
||||
packets, map/unmap them with the DMA_TO_DEVICE direction
|
||||
specifier. For receive packets, just the opposite, map/unmap them
|
||||
with the PCI_DMA_FROMDEVICE direction specifier.
|
||||
with the DMA_FROM_DEVICE direction specifier.
|
||||
|
||||
Using Streaming DMA mappings
|
||||
|
||||
@ -467,43 +462,43 @@ scatterlist.
|
||||
|
||||
To map a single region, you do:
|
||||
|
||||
struct pci_dev *pdev = mydev->pdev;
|
||||
struct device *dev = &my_dev->dev;
|
||||
dma_addr_t dma_handle;
|
||||
void *addr = buffer->ptr;
|
||||
size_t size = buffer->len;
|
||||
|
||||
dma_handle = pci_map_single(pdev, addr, size, direction);
|
||||
dma_handle = dma_map_single(dev, addr, size, direction);
|
||||
|
||||
and to unmap it:
|
||||
|
||||
pci_unmap_single(pdev, dma_handle, size, direction);
|
||||
dma_unmap_single(dev, dma_handle, size, direction);
|
||||
|
||||
You should call pci_unmap_single when the DMA activity is finished, e.g.
|
||||
You should call dma_unmap_single when the DMA activity is finished, e.g.
|
||||
from the interrupt which told you that the DMA transfer is done.
|
||||
|
||||
Using cpu pointers like this for single mappings has a disadvantage,
|
||||
you cannot reference HIGHMEM memory in this way. Thus, there is a
|
||||
map/unmap interface pair akin to pci_{map,unmap}_single. These
|
||||
map/unmap interface pair akin to dma_{map,unmap}_single. These
|
||||
interfaces deal with page/offset pairs instead of cpu pointers.
|
||||
Specifically:
|
||||
|
||||
struct pci_dev *pdev = mydev->pdev;
|
||||
struct device *dev = &my_dev->dev;
|
||||
dma_addr_t dma_handle;
|
||||
struct page *page = buffer->page;
|
||||
unsigned long offset = buffer->offset;
|
||||
size_t size = buffer->len;
|
||||
|
||||
dma_handle = pci_map_page(pdev, page, offset, size, direction);
|
||||
dma_handle = dma_map_page(dev, page, offset, size, direction);
|
||||
|
||||
...
|
||||
|
||||
pci_unmap_page(pdev, dma_handle, size, direction);
|
||||
dma_unmap_page(dev, dma_handle, size, direction);
|
||||
|
||||
Here, "offset" means byte offset within the given page.
|
||||
|
||||
With scatterlists, you map a region gathered from several regions by:
|
||||
|
||||
int i, count = pci_map_sg(pdev, sglist, nents, direction);
|
||||
int i, count = dma_map_sg(dev, sglist, nents, direction);
|
||||
struct scatterlist *sg;
|
||||
|
||||
for_each_sg(sglist, sg, count, i) {
|
||||
@ -527,16 +522,16 @@ accessed sg->address and sg->length as shown above.
|
||||
|
||||
To unmap a scatterlist, just call:
|
||||
|
||||
pci_unmap_sg(pdev, sglist, nents, direction);
|
||||
dma_unmap_sg(dev, sglist, nents, direction);
|
||||
|
||||
Again, make sure DMA activity has already finished.
|
||||
|
||||
PLEASE NOTE: The 'nents' argument to the pci_unmap_sg call must be
|
||||
the _same_ one you passed into the pci_map_sg call,
|
||||
PLEASE NOTE: The 'nents' argument to the dma_unmap_sg call must be
|
||||
the _same_ one you passed into the dma_map_sg call,
|
||||
it should _NOT_ be the 'count' value _returned_ from the
|
||||
pci_map_sg call.
|
||||
dma_map_sg call.
|
||||
|
||||
Every pci_map_{single,sg} call should have its pci_unmap_{single,sg}
|
||||
Every dma_map_{single,sg} call should have its dma_unmap_{single,sg}
|
||||
counterpart, because the bus address space is a shared resource (although
|
||||
in some ports the mapping is per each BUS so fewer devices contend for the
|
||||
same bus address space) and you could render the machine unusable by eating
|
||||
@ -547,14 +542,14 @@ the data in between the DMA transfers, the buffer needs to be synced
|
||||
properly in order for the cpu and device to see the most up-to-date and
|
||||
correct copy of the DMA buffer.
|
||||
|
||||
So, firstly, just map it with pci_map_{single,sg}, and after each DMA
|
||||
So, firstly, just map it with dma_map_{single,sg}, and after each DMA
|
||||
transfer call either:
|
||||
|
||||
pci_dma_sync_single_for_cpu(pdev, dma_handle, size, direction);
|
||||
dma_sync_single_for_cpu(dev, dma_handle, size, direction);
|
||||
|
||||
or:
|
||||
|
||||
pci_dma_sync_sg_for_cpu(pdev, sglist, nents, direction);
|
||||
dma_sync_sg_for_cpu(dev, sglist, nents, direction);
|
||||
|
||||
as appropriate.
|
||||
|
||||
@ -562,27 +557,27 @@ Then, if you wish to let the device get at the DMA area again,
|
||||
finish accessing the data with the cpu, and then before actually
|
||||
giving the buffer to the hardware call either:
|
||||
|
||||
pci_dma_sync_single_for_device(pdev, dma_handle, size, direction);
|
||||
dma_sync_single_for_device(dev, dma_handle, size, direction);
|
||||
|
||||
or:
|
||||
|
||||
pci_dma_sync_sg_for_device(dev, sglist, nents, direction);
|
||||
dma_sync_sg_for_device(dev, sglist, nents, direction);
|
||||
|
||||
as appropriate.
|
||||
|
||||
After the last DMA transfer call one of the DMA unmap routines
|
||||
pci_unmap_{single,sg}. If you don't touch the data from the first pci_map_*
|
||||
call till pci_unmap_*, then you don't have to call the pci_dma_sync_*
|
||||
dma_unmap_{single,sg}. If you don't touch the data from the first dma_map_*
|
||||
call till dma_unmap_*, then you don't have to call the dma_sync_*
|
||||
routines at all.
|
||||
|
||||
Here is pseudo code which shows a situation in which you would need
|
||||
to use the pci_dma_sync_*() interfaces.
|
||||
to use the dma_sync_*() interfaces.
|
||||
|
||||
my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len)
|
||||
{
|
||||
dma_addr_t mapping;
|
||||
|
||||
mapping = pci_map_single(cp->pdev, buffer, len, PCI_DMA_FROMDEVICE);
|
||||
mapping = dma_map_single(cp->dev, buffer, len, DMA_FROM_DEVICE);
|
||||
|
||||
cp->rx_buf = buffer;
|
||||
cp->rx_len = len;
|
||||
@ -606,25 +601,25 @@ to use the pci_dma_sync_*() interfaces.
|
||||
* the DMA transfer with the CPU first
|
||||
* so that we see updated contents.
|
||||
*/
|
||||
pci_dma_sync_single_for_cpu(cp->pdev, cp->rx_dma,
|
||||
dma_sync_single_for_cpu(&cp->dev, cp->rx_dma,
|
||||
cp->rx_len,
|
||||
PCI_DMA_FROMDEVICE);
|
||||
DMA_FROM_DEVICE);
|
||||
|
||||
/* Now it is safe to examine the buffer. */
|
||||
hp = (struct my_card_header *) cp->rx_buf;
|
||||
if (header_is_ok(hp)) {
|
||||
pci_unmap_single(cp->pdev, cp->rx_dma, cp->rx_len,
|
||||
PCI_DMA_FROMDEVICE);
|
||||
dma_unmap_single(&cp->dev, cp->rx_dma, cp->rx_len,
|
||||
DMA_FROM_DEVICE);
|
||||
pass_to_upper_layers(cp->rx_buf);
|
||||
make_and_setup_new_rx_buf(cp);
|
||||
} else {
|
||||
/* Just sync the buffer and give it back
|
||||
* to the card.
|
||||
*/
|
||||
pci_dma_sync_single_for_device(cp->pdev,
|
||||
dma_sync_single_for_device(&cp->dev,
|
||||
cp->rx_dma,
|
||||
cp->rx_len,
|
||||
PCI_DMA_FROMDEVICE);
|
||||
DMA_FROM_DEVICE);
|
||||
give_rx_buf_to_card(cp);
|
||||
}
|
||||
}
|
||||
@ -634,19 +629,19 @@ Drivers converted fully to this interface should not use virt_to_bus any
|
||||
longer, nor should they use bus_to_virt. Some drivers have to be changed a
|
||||
little bit, because there is no longer an equivalent to bus_to_virt in the
|
||||
dynamic DMA mapping scheme - you have to always store the DMA addresses
|
||||
returned by the pci_alloc_consistent, pci_pool_alloc, and pci_map_single
|
||||
calls (pci_map_sg stores them in the scatterlist itself if the platform
|
||||
returned by the dma_alloc_coherent, dma_pool_alloc, and dma_map_single
|
||||
calls (dma_map_sg stores them in the scatterlist itself if the platform
|
||||
supports dynamic DMA mapping in hardware) in your driver structures and/or
|
||||
in the card registers.
|
||||
|
||||
All PCI drivers should be using these interfaces with no exceptions.
|
||||
It is planned to completely remove virt_to_bus() and bus_to_virt() as
|
||||
All drivers should be using these interfaces with no exceptions. It
|
||||
is planned to completely remove virt_to_bus() and bus_to_virt() as
|
||||
they are entirely deprecated. Some ports already do not provide these
|
||||
as it is impossible to correctly support them.
|
||||
|
||||
Optimizing Unmap State Space Consumption
|
||||
|
||||
On many platforms, pci_unmap_{single,page}() is simply a nop.
|
||||
On many platforms, dma_unmap_{single,page}() is simply a nop.
|
||||
Therefore, keeping track of the mapping address and length is a waste
|
||||
of space. Instead of filling your drivers up with ifdefs and the like
|
||||
to "work around" this (which would defeat the whole purpose of a
|
||||
@ -655,7 +650,7 @@ portable API) the following facilities are provided.
|
||||
Actually, instead of describing the macros one by one, we'll
|
||||
transform some example code.
|
||||
|
||||
1) Use DECLARE_PCI_UNMAP_{ADDR,LEN} in state saving structures.
|
||||
1) Use DEFINE_DMA_UNMAP_{ADDR,LEN} in state saving structures.
|
||||
Example, before:
|
||||
|
||||
struct ring_state {
|
||||
@ -668,14 +663,11 @@ transform some example code.
|
||||
|
||||
struct ring_state {
|
||||
struct sk_buff *skb;
|
||||
DECLARE_PCI_UNMAP_ADDR(mapping)
|
||||
DECLARE_PCI_UNMAP_LEN(len)
|
||||
DEFINE_DMA_UNMAP_ADDR(mapping);
|
||||
DEFINE_DMA_UNMAP_LEN(len);
|
||||
};
|
||||
|
||||
NOTE: DO NOT put a semicolon at the end of the DECLARE_*()
|
||||
macro.
|
||||
|
||||
2) Use pci_unmap_{addr,len}_set to set these values.
|
||||
2) Use dma_unmap_{addr,len}_set to set these values.
|
||||
Example, before:
|
||||
|
||||
ringp->mapping = FOO;
|
||||
@ -683,21 +675,21 @@ transform some example code.
|
||||
|
||||
after:
|
||||
|
||||
pci_unmap_addr_set(ringp, mapping, FOO);
|
||||
pci_unmap_len_set(ringp, len, BAR);
|
||||
dma_unmap_addr_set(ringp, mapping, FOO);
|
||||
dma_unmap_len_set(ringp, len, BAR);
|
||||
|
||||
3) Use pci_unmap_{addr,len} to access these values.
|
||||
3) Use dma_unmap_{addr,len} to access these values.
|
||||
Example, before:
|
||||
|
||||
pci_unmap_single(pdev, ringp->mapping, ringp->len,
|
||||
PCI_DMA_FROMDEVICE);
|
||||
dma_unmap_single(dev, ringp->mapping, ringp->len,
|
||||
DMA_FROM_DEVICE);
|
||||
|
||||
after:
|
||||
|
||||
pci_unmap_single(pdev,
|
||||
pci_unmap_addr(ringp, mapping),
|
||||
pci_unmap_len(ringp, len),
|
||||
PCI_DMA_FROMDEVICE);
|
||||
dma_unmap_single(dev,
|
||||
dma_unmap_addr(ringp, mapping),
|
||||
dma_unmap_len(ringp, len),
|
||||
DMA_FROM_DEVICE);
|
||||
|
||||
It really should be self-explanatory. We treat the ADDR and LEN
|
||||
separately, because it is possible for an implementation to only
|
||||
@ -732,15 +724,15 @@ to "Closing".
|
||||
DMA address space is limited on some architectures and an allocation
|
||||
failure can be determined by:
|
||||
|
||||
- checking if pci_alloc_consistent returns NULL or pci_map_sg returns 0
|
||||
- checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0
|
||||
|
||||
- checking the returned dma_addr_t of pci_map_single and pci_map_page
|
||||
by using pci_dma_mapping_error():
|
||||
- checking the returned dma_addr_t of dma_map_single and dma_map_page
|
||||
by using dma_mapping_error():
|
||||
|
||||
dma_addr_t dma_handle;
|
||||
|
||||
dma_handle = pci_map_single(pdev, addr, size, direction);
|
||||
if (pci_dma_mapping_error(pdev, dma_handle)) {
|
||||
dma_handle = dma_map_single(dev, addr, size, direction);
|
||||
if (dma_mapping_error(dev, dma_handle)) {
|
||||
/*
|
||||
* reduce current DMA mapping usage,
|
||||
* delay and try again later or
|
@ -4,20 +4,18 @@
|
||||
James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
|
||||
|
||||
This document describes the DMA API. For a more gentle introduction
|
||||
phrased in terms of the pci_ equivalents (and actual examples) see
|
||||
Documentation/PCI/PCI-DMA-mapping.txt.
|
||||
of the API (and actual examples) see
|
||||
Documentation/DMA-API-HOWTO.txt.
|
||||
|
||||
This API is split into two pieces. Part I describes the API and the
|
||||
corresponding pci_ API. Part II describes the extensions to the API
|
||||
for supporting non-consistent memory machines. Unless you know that
|
||||
your driver absolutely has to support non-consistent platforms (this
|
||||
is usually only legacy platforms) you should only use the API
|
||||
described in part I.
|
||||
This API is split into two pieces. Part I describes the API. Part II
|
||||
describes the extensions to the API for supporting non-consistent
|
||||
memory machines. Unless you know that your driver absolutely has to
|
||||
support non-consistent platforms (this is usually only legacy
|
||||
platforms) you should only use the API described in part I.
|
||||
|
||||
Part I - pci_ and dma_ Equivalent API
|
||||
Part I - dma_ API
|
||||
-------------------------------------
|
||||
|
||||
To get the pci_ API, you must #include <linux/pci.h>
|
||||
To get the dma_ API, you must #include <linux/dma-mapping.h>
|
||||
|
||||
|
||||
@ -27,9 +25,6 @@ Part Ia - Using large dma-coherent buffers
|
||||
void *
|
||||
dma_alloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
void *
|
||||
pci_alloc_consistent(struct pci_dev *dev, size_t size,
|
||||
dma_addr_t *dma_handle)
|
||||
|
||||
Consistent memory is memory for which a write by either the device or
|
||||
the processor can immediately be read by the processor or device
|
||||
@ -53,15 +48,11 @@ The simplest way to do that is to use the dma_pool calls (see below).
|
||||
The flag parameter (dma_alloc_coherent only) allows the caller to
|
||||
specify the GFP_ flags (see kmalloc) for the allocation (the
|
||||
implementation may choose to ignore flags that affect the location of
|
||||
the returned memory, like GFP_DMA). For pci_alloc_consistent, you
|
||||
must assume GFP_ATOMIC behaviour.
|
||||
the returned memory, like GFP_DMA).
|
||||
|
||||
void
|
||||
dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
void
|
||||
pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
|
||||
Free the region of consistent memory you previously allocated. dev,
|
||||
size and dma_handle must all be the same as those passed into the
|
||||
@ -89,10 +80,6 @@ for alignment, like queue heads needing to be aligned on N-byte boundaries.
|
||||
dma_pool_create(const char *name, struct device *dev,
|
||||
size_t size, size_t align, size_t alloc);
|
||||
|
||||
struct pci_pool *
|
||||
pci_pool_create(const char *name, struct pci_device *dev,
|
||||
size_t size, size_t align, size_t alloc);
|
||||
|
||||
The pool create() routines initialize a pool of dma-coherent buffers
|
||||
for use with a given device. It must be called in a context which
|
||||
can sleep.
|
||||
@ -108,9 +95,6 @@ from this pool must not cross 4KByte boundaries.
|
||||
void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
|
||||
dma_addr_t *dma_handle);
|
||||
|
||||
void *pci_pool_alloc(struct pci_pool *pool, gfp_t gfp_flags,
|
||||
dma_addr_t *dma_handle);
|
||||
|
||||
This allocates memory from the pool; the returned memory will meet the size
|
||||
and alignment requirements specified at creation time. Pass GFP_ATOMIC to
|
||||
prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks),
|
||||
@ -122,9 +106,6 @@ pool's device.
|
||||
void dma_pool_free(struct dma_pool *pool, void *vaddr,
|
||||
dma_addr_t addr);
|
||||
|
||||
void pci_pool_free(struct pci_pool *pool, void *vaddr,
|
||||
dma_addr_t addr);
|
||||
|
||||
This puts memory back into the pool. The pool is what was passed to
|
||||
the pool allocation routine; the cpu (vaddr) and dma addresses are what
|
||||
were returned when that routine allocated the memory being freed.
|
||||
@ -132,8 +113,6 @@ were returned when that routine allocated the memory being freed.
|
||||
|
||||
void dma_pool_destroy(struct dma_pool *pool);
|
||||
|
||||
void pci_pool_destroy(struct pci_pool *pool);
|
||||
|
||||
The pool destroy() routines free the resources of the pool. They must be
|
||||
called in a context which can sleep. Make sure you've freed all allocated
|
||||
memory back to the pool before you destroy it.
|
||||
@ -144,8 +123,6 @@ Part Ic - DMA addressing limitations
|
||||
|
||||
int
|
||||
dma_supported(struct device *dev, u64 mask)
|
||||
int
|
||||
pci_dma_supported(struct pci_dev *hwdev, u64 mask)
|
||||
|
||||
Checks to see if the device can support DMA to the memory described by
|
||||
mask.
|
||||
@ -159,8 +136,14 @@ driver writers.
|
||||
|
||||
int
|
||||
dma_set_mask(struct device *dev, u64 mask)
|
||||
|
||||
Checks to see if the mask is possible and updates the device
|
||||
parameters if it is.
|
||||
|
||||
Returns: 0 if successful and a negative error if not.
|
||||
|
||||
int
|
||||
pci_set_dma_mask(struct pci_device *dev, u64 mask)
|
||||
dma_set_coherent_mask(struct device *dev, u64 mask)
|
||||
|
||||
Checks to see if the mask is possible and updates the device
|
||||
parameters if it is.
|
||||
@ -187,9 +170,6 @@ Part Id - Streaming DMA mappings
|
||||
dma_addr_t
|
||||
dma_map_single(struct device *dev, void *cpu_addr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
dma_addr_t
|
||||
pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size,
|
||||
int direction)
|
||||
|
||||
Maps a piece of processor virtual memory so it can be accessed by the
|
||||
device and returns the physical handle of the memory.
|
||||
@ -198,14 +178,10 @@ The direction for both api's may be converted freely by casting.
|
||||
However the dma_ API uses a strongly typed enumerator for its
|
||||
direction:
|
||||
|
||||
DMA_NONE = PCI_DMA_NONE no direction (used for
|
||||
debugging)
|
||||
DMA_TO_DEVICE = PCI_DMA_TODEVICE data is going from the
|
||||
memory to the device
|
||||
DMA_FROM_DEVICE = PCI_DMA_FROMDEVICE data is coming from
|
||||
the device to the
|
||||
memory
|
||||
DMA_BIDIRECTIONAL = PCI_DMA_BIDIRECTIONAL direction isn't known
|
||||
DMA_NONE no direction (used for debugging)
|
||||
DMA_TO_DEVICE data is going from the memory to the device
|
||||
DMA_FROM_DEVICE data is coming from the device to the memory
|
||||
DMA_BIDIRECTIONAL direction isn't known
|
||||
|
||||
Notes: Not all memory regions in a machine can be mapped by this
|
||||
API. Further, regions that appear to be physically contiguous in
|
||||
@ -268,9 +244,6 @@ cache lines are updated with data that the device may have changed).
|
||||
void
|
||||
dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
|
||||
size_t size, int direction)
|
||||
|
||||
Unmaps the region previously mapped. All the parameters passed in
|
||||
must be identical to those passed in (and returned) by the mapping
|
||||
@ -280,15 +253,9 @@ dma_addr_t
|
||||
dma_map_page(struct device *dev, struct page *page,
|
||||
unsigned long offset, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
dma_addr_t
|
||||
pci_map_page(struct pci_dev *hwdev, struct page *page,
|
||||
unsigned long offset, size_t size, int direction)
|
||||
void
|
||||
dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
|
||||
size_t size, int direction)
|
||||
|
||||
API for mapping and unmapping for pages. All the notes and warnings
|
||||
for the other mapping APIs apply here. Also, although the <offset>
|
||||
@ -299,9 +266,6 @@ cache width is.
|
||||
int
|
||||
dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
|
||||
|
||||
int
|
||||
pci_dma_mapping_error(struct pci_dev *hwdev, dma_addr_t dma_addr)
|
||||
|
||||
In some circumstances dma_map_single and dma_map_page will fail to create
|
||||
a mapping. A driver can check for these errors by testing the returned
|
||||
dma address with dma_mapping_error(). A non-zero return value means the mapping
|
||||
@ -311,9 +275,6 @@ reduce current DMA mapping usage or delay and try again later).
|
||||
int
|
||||
dma_map_sg(struct device *dev, struct scatterlist *sg,
|
||||
int nents, enum dma_data_direction direction)
|
||||
int
|
||||
pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
|
||||
int nents, int direction)
|
||||
|
||||
Returns: the number of physical segments mapped (this may be shorter
|
||||
than <nents> passed in if some elements of the scatter/gather list are
|
||||
@ -353,9 +314,6 @@ accessed sg->address and sg->length as shown above.
|
||||
void
|
||||
dma_unmap_sg(struct device *dev, struct scatterlist *sg,
|
||||
int nhwentries, enum dma_data_direction direction)
|
||||
void
|
||||
pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
|
||||
int nents, int direction)
|
||||
|
||||
Unmap the previously mapped scatter/gather list. All the parameters
|
||||
must be the same as those passed in to the scatter/gather mapping
|
||||
@ -365,21 +323,23 @@ Note: <nents> must be the number you passed in, *not* the number of
|
||||
physical entries returned.
|
||||
|
||||
void
|
||||
dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size,
|
||||
dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t dma_handle,
|
||||
size_t size, int direction)
|
||||
void
|
||||
dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems,
|
||||
dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg,
|
||||
int nelems, int direction)
|
||||
dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Synchronise a single contiguous or scatter/gather mapping. All the
|
||||
parameters must be the same as those passed into the single mapping
|
||||
API.
|
||||
Synchronise a single contiguous or scatter/gather mapping for the cpu
|
||||
and device. With the sync_sg API, all the parameters must be the same
|
||||
as those passed into the single mapping API. With the sync_single API,
|
||||
you can use dma_handle and size parameters that aren't identical to
|
||||
those passed into the single mapping API to do a partial sync.
|
||||
|
||||
Notes: You must do this:
|
||||
|
||||
@ -461,9 +421,9 @@ void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
|
||||
Part II - Advanced dma_ usage
|
||||
-----------------------------
|
||||
|
||||
Warning: These pieces of the DMA API have no PCI equivalent. They
|
||||
should also not be used in the majority of cases, since they cater for
|
||||
unlikely corner cases that don't belong in usual drivers.
|
||||
Warning: These pieces of the DMA API should not be used in the
|
||||
majority of cases, since they cater for unlikely corner cases that
|
||||
don't belong in usual drivers.
|
||||
|
||||
If you don't understand how cache line coherency works between a
|
||||
processor and an I/O device, you should not be using this part of the
|
||||
@ -513,16 +473,6 @@ line, but it will guarantee that one or more cache lines fit exactly
|
||||
into the width returned by this call. It will also always be a power
|
||||
of two for easy alignment.
|
||||
|
||||
void
|
||||
dma_sync_single_range(struct device *dev, dma_addr_t dma_handle,
|
||||
unsigned long offset, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Does a partial sync, starting at offset and continuing for size. You
|
||||
must be careful to observe the cache alignment and width when doing
|
||||
anything like this. You must also be extra careful about accessing
|
||||
memory you intend to sync partially.
|
||||
|
||||
void
|
||||
dma_cache_sync(struct device *dev, void *vaddr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
@ -45,7 +45,7 @@
|
||||
</sect1>
|
||||
|
||||
<sect1><title>Atomic and pointer manipulation</title>
|
||||
!Iarch/x86/include/asm/atomic_32.h
|
||||
!Iarch/x86/include/asm/atomic.h
|
||||
!Iarch/x86/include/asm/unaligned.h
|
||||
</sect1>
|
||||
|
||||
|
@ -316,7 +316,7 @@ CPU B: spin_unlock_irqrestore(&dev_lock, flags)
|
||||
|
||||
<chapter id="pubfunctions">
|
||||
<title>Public Functions Provided</title>
|
||||
!Iarch/x86/include/asm/io_32.h
|
||||
!Iarch/x86/include/asm/io.h
|
||||
!Elib/iomap.c
|
||||
</chapter>
|
||||
|
||||
|
@ -144,7 +144,7 @@ usage should require reading the full document.
|
||||
this though and the recommendation to allow only a single
|
||||
interface in STA mode at first!
|
||||
</para>
|
||||
!Finclude/net/mac80211.h ieee80211_if_init_conf
|
||||
!Finclude/net/mac80211.h ieee80211_vif
|
||||
</chapter>
|
||||
|
||||
<chapter id="rx-tx">
|
||||
@ -234,7 +234,6 @@ usage should require reading the full document.
|
||||
<title>Multiple queues and QoS support</title>
|
||||
<para>TBD</para>
|
||||
!Finclude/net/mac80211.h ieee80211_tx_queue_params
|
||||
!Finclude/net/mac80211.h ieee80211_tx_queue_stats
|
||||
</chapter>
|
||||
|
||||
<chapter id="AP">
|
||||
|
@ -174,7 +174,7 @@
|
||||
</para>
|
||||
<programlisting>
|
||||
static struct mtd_info *board_mtd;
|
||||
static unsigned long baseaddr;
|
||||
static void __iomem *baseaddr;
|
||||
</programlisting>
|
||||
<para>
|
||||
Static example
|
||||
@ -182,7 +182,7 @@ static unsigned long baseaddr;
|
||||
<programlisting>
|
||||
static struct mtd_info board_mtd;
|
||||
static struct nand_chip board_chip;
|
||||
static unsigned long baseaddr;
|
||||
static void __iomem *baseaddr;
|
||||
</programlisting>
|
||||
</sect1>
|
||||
<sect1 id="Partition_defines">
|
||||
@ -283,8 +283,8 @@ int __init board_init (void)
|
||||
}
|
||||
|
||||
/* map physical address */
|
||||
baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
|
||||
if(!baseaddr){
|
||||
baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
|
||||
if (!baseaddr) {
|
||||
printk("Ioremap to access NAND chip failed\n");
|
||||
err = -EIO;
|
||||
goto out_mtd;
|
||||
@ -316,7 +316,7 @@ int __init board_init (void)
|
||||
goto out;
|
||||
|
||||
out_ior:
|
||||
iounmap((void *)baseaddr);
|
||||
iounmap(baseaddr);
|
||||
out_mtd:
|
||||
kfree (board_mtd);
|
||||
out:
|
||||
@ -341,7 +341,7 @@ static void __exit board_cleanup (void)
|
||||
nand_release (board_mtd);
|
||||
|
||||
/* unmap physical address */
|
||||
iounmap((void *)baseaddr);
|
||||
iounmap(baseaddr);
|
||||
|
||||
/* Free the MTD device structure */
|
||||
kfree (board_mtd);
|
||||
@ -488,7 +488,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
|
||||
The ECC bytes must be placed immediately after the data
|
||||
bytes in order to make the syndrome generator work. This
|
||||
is contrary to the usual layout used by software ECC. The
|
||||
seperation of data and out of band area is not longer
|
||||
separation of data and out of band area is no longer
|
||||
possible. The nand driver code handles this layout and
|
||||
the remaining free bytes in the oob area are managed by
|
||||
the autoplacement code. Provide a matching oob-layout
|
||||
@ -560,7 +560,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
|
||||
bad blocks. They have factory marked good blocks. The marker pattern
|
||||
is erased when the block is erased to be reused. So in case of
|
||||
powerloss before writing the pattern back to the chip this block
|
||||
would be lost and added to the bad blocks. Therefor we scan the
|
||||
would be lost and added to the bad blocks. Therefore we scan the
|
||||
chip(s) when we detect them the first time for good blocks and
|
||||
store this information in a bad block table before erasing any
|
||||
of the blocks.
|
||||
@ -1094,7 +1094,7 @@ in this page</entry>
|
||||
manufacturers specifications. This applies similar to the spare area.
|
||||
</para>
|
||||
<para>
|
||||
Therefor NAND aware filesystems must either write in page size chunks
|
||||
Therefore NAND aware filesystems must either write in page size chunks
|
||||
or hold a writebuffer to collect smaller writes until they sum up to
|
||||
pagesize. Available NAND aware filesystems: JFFS2, YAFFS.
|
||||
</para>
|
||||
|
@ -16,6 +16,15 @@
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
<author>
|
||||
<firstname>William</firstname>
|
||||
<surname>Cohen</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>wcohen@redhat.com</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
|
||||
<legalnotice>
|
||||
@ -91,4 +100,8 @@
|
||||
!Iinclude/trace/events/signal.h
|
||||
</chapter>
|
||||
|
||||
<chapter id="block">
|
||||
<title>Block IO</title>
|
||||
!Iinclude/trace/events/block.h
|
||||
</chapter>
|
||||
</book>
|
||||
|
@ -1170,7 +1170,7 @@ frames per second. If less than this number of frames is to be
|
||||
captured or output, applications can request frame skipping or
|
||||
duplicating on the driver side. This is especially useful when using
|
||||
the &func-read; or &func-write;, which are not augmented by timestamps
|
||||
or sequence counters, and to avoid unneccessary data copying.</para>
|
||||
or sequence counters, and to avoid unnecessary data copying.</para>
|
||||
|
||||
<para>Finally these ioctls can be used to determine the number of
|
||||
buffers used internally by a driver in read/write mode. For
|
||||
|
@ -589,7 +589,8 @@ number of a video input as in &v4l2-input; field
|
||||
<entry></entry>
|
||||
<entry>A place holder for future extensions and custom
|
||||
(driver defined) buffer types
|
||||
<constant>V4L2_BUF_TYPE_PRIVATE</constant> and higher.</entry>
|
||||
<constant>V4L2_BUF_TYPE_PRIVATE</constant> and higher. Applications
|
||||
should set this to 0.</entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
|
@ -55,7 +55,7 @@ captured or output, applications can request frame skipping or
|
||||
duplicating on the driver side. This is especially useful when using
|
||||
the <function>read()</function> or <function>write()</function>, which
|
||||
are not augmented by timestamps or sequence counters, and to avoid
|
||||
unneccessary data copying.</para>
|
||||
unnecessary data copying.</para>
|
||||
|
||||
<para>Further these ioctls can be used to determine the number of
|
||||
buffers used internally by a driver in read/write mode. For
|
||||
|
@ -54,12 +54,10 @@ to enqueue an empty (capturing) or filled (output) buffer in the
|
||||
driver's incoming queue. The semantics depend on the selected I/O
|
||||
method.</para>
|
||||
|
||||
<para>To enqueue a <link linkend="mmap">memory mapped</link>
|
||||
buffer applications set the <structfield>type</structfield> field of a
|
||||
&v4l2-buffer; to the same buffer type as previously &v4l2-format;
|
||||
<structfield>type</structfield> and &v4l2-requestbuffers;
|
||||
<structfield>type</structfield>, the <structfield>memory</structfield>
|
||||
field to <constant>V4L2_MEMORY_MMAP</constant> and the
|
||||
<para>To enqueue a buffer applications set the <structfield>type</structfield>
|
||||
field of a &v4l2-buffer; to the same buffer type as was previously used
|
||||
with &v4l2-format; <structfield>type</structfield> and &v4l2-requestbuffers;
|
||||
<structfield>type</structfield>. Applications must also set the
|
||||
<structfield>index</structfield> field. Valid index numbers range from
|
||||
zero to the number of buffers allocated with &VIDIOC-REQBUFS;
|
||||
(&v4l2-requestbuffers; <structfield>count</structfield>) minus one. The
|
||||
@ -70,8 +68,19 @@ intended for output (<structfield>type</structfield> is
|
||||
<constant>V4L2_BUF_TYPE_VBI_OUTPUT</constant>) applications must also
|
||||
initialize the <structfield>bytesused</structfield>,
|
||||
<structfield>field</structfield> and
|
||||
<structfield>timestamp</structfield> fields. See <xref
|
||||
linkend="buffer" /> for details. When
|
||||
<structfield>timestamp</structfield> fields, see <xref
|
||||
linkend="buffer" /> for details.
|
||||
Applications must also set <structfield>flags</structfield> to 0. If a driver
|
||||
supports capturing from specific video inputs and you want to specify a video
|
||||
input, then <structfield>flags</structfield> should be set to
|
||||
<constant>V4L2_BUF_FLAG_INPUT</constant> and the field
|
||||
<structfield>input</structfield> must be initialized to the desired input.
|
||||
The <structfield>reserved</structfield> field must be set to 0.
|
||||
</para>
|
||||
|
||||
<para>To enqueue a <link linkend="mmap">memory mapped</link>
|
||||
buffer applications set the <structfield>memory</structfield>
|
||||
field to <constant>V4L2_MEMORY_MMAP</constant>. When
|
||||
<constant>VIDIOC_QBUF</constant> is called with a pointer to this
|
||||
structure the driver sets the
|
||||
<constant>V4L2_BUF_FLAG_MAPPED</constant> and
|
||||
@ -81,14 +90,10 @@ structure the driver sets the
|
||||
&EINVAL;.</para>
|
||||
|
||||
<para>To enqueue a <link linkend="userp">user pointer</link>
|
||||
buffer applications set the <structfield>type</structfield> field of a
|
||||
&v4l2-buffer; to the same buffer type as previously &v4l2-format;
|
||||
<structfield>type</structfield> and &v4l2-requestbuffers;
|
||||
<structfield>type</structfield>, the <structfield>memory</structfield>
|
||||
field to <constant>V4L2_MEMORY_USERPTR</constant> and the
|
||||
buffer applications set the <structfield>memory</structfield>
|
||||
field to <constant>V4L2_MEMORY_USERPTR</constant>, the
|
||||
<structfield>m.userptr</structfield> field to the address of the
|
||||
buffer and <structfield>length</structfield> to its size. When the
|
||||
buffer is intended for output additional fields must be set as above.
|
||||
buffer and <structfield>length</structfield> to its size.
|
||||
When <constant>VIDIOC_QBUF</constant> is called with a pointer to this
|
||||
structure the driver sets the <constant>V4L2_BUF_FLAG_QUEUED</constant>
|
||||
flag and clears the <constant>V4L2_BUF_FLAG_MAPPED</constant> and
|
||||
@ -96,13 +101,14 @@ flag and clears the <constant>V4L2_BUF_FLAG_MAPPED</constant> and
|
||||
<structfield>flags</structfield> field, or it returns an error code.
|
||||
This ioctl locks the memory pages of the buffer in physical memory,
|
||||
they cannot be swapped out to disk. Buffers remain locked until
|
||||
dequeued, until the &VIDIOC-STREAMOFF; or &VIDIOC-REQBUFS; ioctl are
|
||||
dequeued, until the &VIDIOC-STREAMOFF; or &VIDIOC-REQBUFS; ioctl is
|
||||
called, or until the device is closed.</para>
|
||||
|
||||
<para>Applications call the <constant>VIDIOC_DQBUF</constant>
|
||||
ioctl to dequeue a filled (capturing) or displayed (output) buffer
|
||||
from the driver's outgoing queue. They just set the
|
||||
<structfield>type</structfield> and <structfield>memory</structfield>
|
||||
<structfield>type</structfield>, <structfield>memory</structfield>
|
||||
and <structfield>reserved</structfield>
|
||||
fields of a &v4l2-buffer; as above, when <constant>VIDIOC_DQBUF</constant>
|
||||
is called with a pointer to this structure the driver fills the
|
||||
remaining fields or returns an error code.</para>
|
||||
|
@ -54,12 +54,13 @@ buffer at any time after buffers have been allocated with the
|
||||
&VIDIOC-REQBUFS; ioctl.</para>
|
||||
|
||||
<para>Applications set the <structfield>type</structfield> field
|
||||
of a &v4l2-buffer; to the same buffer type as previously
|
||||
of a &v4l2-buffer; to the same buffer type as was previously used with
|
||||
&v4l2-format; <structfield>type</structfield> and &v4l2-requestbuffers;
|
||||
<structfield>type</structfield>, and the <structfield>index</structfield>
|
||||
field. Valid index numbers range from zero
|
||||
to the number of buffers allocated with &VIDIOC-REQBUFS;
|
||||
(&v4l2-requestbuffers; <structfield>count</structfield>) minus one.
|
||||
The <structfield>reserved</structfield> field should be set to 0.
|
||||
After calling <constant>VIDIOC_QUERYBUF</constant> with a pointer to
|
||||
this structure drivers return an error code or fill the rest of
|
||||
the structure.</para>
|
||||
@ -68,8 +69,8 @@ the structure.</para>
|
||||
<constant>V4L2_BUF_FLAG_MAPPED</constant>,
|
||||
<constant>V4L2_BUF_FLAG_QUEUED</constant> and
|
||||
<constant>V4L2_BUF_FLAG_DONE</constant> flags will be valid. The
|
||||
<structfield>memory</structfield> field will be set to
|
||||
<constant>V4L2_MEMORY_MMAP</constant>, the <structfield>m.offset</structfield>
|
||||
<structfield>memory</structfield> field will be set to the current
|
||||
I/O method, the <structfield>m.offset</structfield>
|
||||
contains the offset of the buffer from the start of the device memory,
|
||||
the <structfield>length</structfield> field its size. The driver may
|
||||
or may not set the remaining fields and flags, they are meaningless in
|
||||
|
@ -54,23 +54,23 @@ I/O. Memory mapped buffers are located in device memory and must be
|
||||
allocated with this ioctl before they can be mapped into the
|
||||
application's address space. User buffers are allocated by
|
||||
applications themselves, and this ioctl is merely used to switch the
|
||||
driver into user pointer I/O mode.</para>
|
||||
driver into user pointer I/O mode and to set up some internal structures.</para>
|
||||
|
||||
<para>To allocate device buffers applications initialize three
|
||||
fields of a <structname>v4l2_requestbuffers</structname> structure.
|
||||
<para>To allocate device buffers applications initialize all
|
||||
fields of the <structname>v4l2_requestbuffers</structname> structure.
|
||||
They set the <structfield>type</structfield> field to the respective
|
||||
stream or buffer type, the <structfield>count</structfield> field to
|
||||
the desired number of buffers, and <structfield>memory</structfield>
|
||||
must be set to <constant>V4L2_MEMORY_MMAP</constant>. When the ioctl
|
||||
is called with a pointer to this structure the driver attempts to
|
||||
allocate the requested number of buffers and stores the actual number
|
||||
the desired number of buffers, <structfield>memory</structfield>
|
||||
must be set to the requested I/O method and the reserved array
|
||||
must be zeroed. When the ioctl
|
||||
is called with a pointer to this structure the driver will attempt to allocate
|
||||
the requested number of buffers and it stores the actual number
|
||||
allocated in the <structfield>count</structfield> field. It can be
|
||||
smaller than the number requested, even zero, when the driver runs out
|
||||
of free memory. A larger number is possible when the driver requires
|
||||
more buffers to function correctly.<footnote>
|
||||
<para>For example video output requires at least two buffers,
|
||||
of free memory. A larger number is also possible when the driver requires
|
||||
more buffers to function correctly. For example video output requires at least two buffers,
|
||||
one displayed and one filled by the application.</para>
|
||||
</footnote> When memory mapping I/O is not supported the ioctl
|
||||
<para>When the I/O method is not supported the ioctl
|
||||
returns an &EINVAL;.</para>
|
||||
|
||||
<para>Applications can call <constant>VIDIOC_REQBUFS</constant>
|
||||
@ -81,14 +81,6 @@ in progress, an implicit &VIDIOC-STREAMOFF;. <!-- mhs: I see no
|
||||
reason why munmap()ping one or even all buffers must imply
|
||||
streamoff.--></para>
|
||||
|
||||
<para>To negotiate user pointer I/O, applications initialize only
|
||||
the <structfield>type</structfield> field and set
|
||||
<structfield>memory</structfield> to
|
||||
<constant>V4L2_MEMORY_USERPTR</constant>. When the ioctl is called
|
||||
with a pointer to this structure the driver prepares for user pointer
|
||||
I/O, when this I/O method is not supported the ioctl returns an
|
||||
&EINVAL;.</para>
|
||||
|
||||
<table pgwide="1" frame="none" id="v4l2-requestbuffers">
|
||||
<title>struct <structname>v4l2_requestbuffers</structname></title>
|
||||
<tgroup cols="3">
|
||||
@ -97,9 +89,7 @@ I/O, when this I/O method is not supported the ioctl returns an
|
||||
<row>
|
||||
<entry>__u32</entry>
|
||||
<entry><structfield>count</structfield></entry>
|
||||
<entry>The number of buffers requested or granted. This
|
||||
field is only used when <structfield>memory</structfield> is set to
|
||||
<constant>V4L2_MEMORY_MMAP</constant>.</entry>
|
||||
<entry>The number of buffers requested or granted.</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>&v4l2-buf-type;</entry>
|
||||
@ -120,7 +110,7 @@ as the &v4l2-format; <structfield>type</structfield> field. See <xref
|
||||
<entry><structfield>reserved</structfield>[2]</entry>
|
||||
<entry>A place holder for future extensions and custom
|
||||
(driver defined) buffer types <constant>V4L2_BUF_TYPE_PRIVATE</constant> and
|
||||
higher.</entry>
|
||||
higher. This array should be zeroed by applications.</entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
|
@ -221,8 +221,8 @@ branches. These different branches are:
|
||||
- main 2.6.x kernel tree
|
||||
- 2.6.x.y -stable kernel tree
|
||||
- 2.6.x -git kernel patches
|
||||
- 2.6.x -mm kernel patches
|
||||
- subsystem specific kernel trees and patches
|
||||
- the 2.6.x -next kernel tree for integration tests
|
||||
|
||||
2.6.x kernel tree
|
||||
-----------------
|
||||
@ -232,9 +232,9 @@ process is as follows:
|
||||
- As soon as a new kernel is released a two weeks window is open,
|
||||
during this period of time maintainers can submit big diffs to
|
||||
Linus, usually the patches that have already been included in the
|
||||
-mm kernel for a few weeks. The preferred way to submit big changes
|
||||
-next kernel for a few weeks. The preferred way to submit big changes
|
||||
is using git (the kernel's source management tool, more information
|
||||
can be found at http://git.or.cz/) but plain patches are also just
|
||||
can be found at http://git-scm.com/) but plain patches are also just
|
||||
fine.
|
||||
- After two weeks a -rc1 kernel is released it is now possible to push
|
||||
only patches that do not include new features that could affect the
|
||||
@ -293,84 +293,43 @@ daily and represent the current state of Linus' tree. They are more
|
||||
experimental than -rc kernels since they are generated automatically
|
||||
without even a cursory glance to see if they are sane.
|
||||
|
||||
2.6.x -mm kernel patches
|
||||
------------------------
|
||||
These are experimental kernel patches released by Andrew Morton. Andrew
|
||||
takes all of the different subsystem kernel trees and patches and mushes
|
||||
them together, along with a lot of patches that have been plucked from
|
||||
the linux-kernel mailing list. This tree serves as a proving ground for
|
||||
new features and patches. Once a patch has proved its worth in -mm for
|
||||
a while Andrew or the subsystem maintainer pushes it on to Linus for
|
||||
inclusion in mainline.
|
||||
|
||||
It is heavily encouraged that all new patches get tested in the -mm tree
|
||||
before they are sent to Linus for inclusion in the main kernel tree. Code
|
||||
which does not make an appearance in -mm before the opening of the merge
|
||||
window will prove hard to merge into the mainline.
|
||||
|
||||
These kernels are not appropriate for use on systems that are supposed
|
||||
to be stable and they are more risky to run than any of the other
|
||||
branches.
|
||||
|
||||
If you wish to help out with the kernel development process, please test
|
||||
and use these kernel releases and provide feedback to the linux-kernel
|
||||
mailing list if you have any problems, and if everything works properly.
|
||||
|
||||
In addition to all the other experimental patches, these kernels usually
|
||||
also contain any changes in the mainline -git kernels available at the
|
||||
time of release.
|
||||
|
||||
The -mm kernels are not released on a fixed schedule, but usually a few
|
||||
-mm kernels are released in between each -rc kernel (1 to 3 is common).
|
||||
|
||||
Subsystem Specific kernel trees and patches
|
||||
-------------------------------------------
|
||||
A number of the different kernel subsystem developers expose their
|
||||
development trees so that others can see what is happening in the
|
||||
different areas of the kernel. These trees are pulled into the -mm
|
||||
kernel releases as described above.
|
||||
The maintainers of the various kernel subsystems --- and also many
|
||||
kernel subsystem developers --- expose their current state of
|
||||
development in source repositories. That way, others can see what is
|
||||
happening in the different areas of the kernel. In areas where
|
||||
development is rapid, a developer may be asked to base his submissions
|
||||
onto such a subsystem kernel tree so that conflicts between the
|
||||
submission and other already ongoing work are avoided.
|
||||
|
||||
Here is a list of some of the different kernel trees available:
|
||||
git trees:
|
||||
- Kbuild development tree, Sam Ravnborg <sam@ravnborg.org>
|
||||
git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
|
||||
Most of these repositories are git trees, but there are also other SCMs
|
||||
in use, or patch queues being published as quilt series. Addresses of
|
||||
these subsystem repositories are listed in the MAINTAINERS file. Many
|
||||
of them can be browsed at http://git.kernel.org/.
|
||||
|
||||
- ACPI development tree, Len Brown <len.brown@intel.com>
|
||||
git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
|
||||
Before a proposed patch is committed to such a subsystem tree, it is
|
||||
subject to review which primarily happens on mailing lists (see the
|
||||
respective section below). For several kernel subsystems, this review
|
||||
process is tracked with the tool patchwork. Patchwork offers a web
|
||||
interface which shows patch postings, any comments on a patch or
|
||||
revisions to it, and maintainers can mark patches as under review,
|
||||
accepted, or rejected. Most of these patchwork sites are listed at
|
||||
http://patchwork.kernel.org/ or http://patchwork.ozlabs.org/.
|
||||
|
||||
- Block development tree, Jens Axboe <jens.axboe@oracle.com>
|
||||
git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
|
||||
2.6.x -next kernel tree for integration tests
|
||||
---------------------------------------------
|
||||
Before updates from subsystem trees are merged into the mainline 2.6.x
|
||||
tree, they need to be integration-tested. For this purpose, a special
|
||||
testing repository exists into which virtually all subsystem trees are
|
||||
pulled on an almost daily basis:
|
||||
http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git
|
||||
http://linux.f-seidel.de/linux-next/pmwiki/
|
||||
|
||||
- DRM development tree, Dave Airlie <airlied@linux.ie>
|
||||
git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
|
||||
This way, the -next kernel gives a summary outlook onto what will be
|
||||
expected to go into the mainline kernel at the next merge period.
|
||||
Adventurous testers are very welcome to runtime-test the -next kernel.
|
||||
|
||||
- ia64 development tree, Tony Luck <tony.luck@intel.com>
|
||||
git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
|
||||
|
||||
- infiniband, Roland Dreier <rolandd@cisco.com>
|
||||
git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
|
||||
|
||||
- libata, Jeff Garzik <jgarzik@pobox.com>
|
||||
git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
|
||||
|
||||
- network drivers, Jeff Garzik <jgarzik@pobox.com>
|
||||
git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
|
||||
|
||||
- pcmcia, Dominik Brodowski <linux@dominikbrodowski.net>
|
||||
git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
|
||||
|
||||
- SCSI, James Bottomley <James.Bottomley@hansenpartnership.com>
|
||||
git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
|
||||
|
||||
- x86, Ingo Molnar <mingo@elte.hu>
|
||||
git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git
|
||||
|
||||
quilt trees:
|
||||
- USB, Driver Core, and I2C, Greg Kroah-Hartman <gregkh@suse.de>
|
||||
kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
|
||||
|
||||
Other kernel trees can be found listed at http://git.kernel.org/ and in
|
||||
the MAINTAINERS file.
|
||||
|
||||
Bug Reporting
|
||||
-------------
|
||||
|
@ -157,7 +157,7 @@ For such memory, you can do things like
|
||||
* access only the 640k-1MB area, so anything else
|
||||
* has to be remapped.
|
||||
*/
|
||||
char * baseptr = ioremap(0xFC000000, 1024*1024);
|
||||
void __iomem *baseptr = ioremap(0xFC000000, 1024*1024);
|
||||
|
||||
/* write a 'A' to the offset 10 of the area */
|
||||
writeb('A',baseptr+10);
|
||||
|
@ -365,6 +365,7 @@ You can change this at module load time (for a module) with:
|
||||
regshifts=<shift1>,<shift2>,...
|
||||
slave_addrs=<addr1>,<addr2>,...
|
||||
force_kipmid=<enable1>,<enable2>,...
|
||||
kipmid_max_busy_us=<ustime1>,<ustime2>,...
|
||||
unload_when_empty=[0|1]
|
||||
|
||||
Each of these except si_trydefaults is a list, the first item for the
|
||||
@ -433,6 +434,7 @@ kernel command line as:
|
||||
ipmi_si.regshifts=<shift1>,<shift2>,...
|
||||
ipmi_si.slave_addrs=<addr1>,<addr2>,...
|
||||
ipmi_si.force_kipmid=<enable1>,<enable2>,...
|
||||
ipmi_si.kipmid_max_busy_us=<ustime1>,<ustime2>,...
|
||||
|
||||
It works the same as the module parameters of the same names.
|
||||
|
||||
@ -450,6 +452,16 @@ force this thread on or off. If you force it off and don't have
|
||||
interrupts, the driver will run VERY slowly. Don't blame me,
|
||||
these interfaces suck.
|
||||
|
||||
Unfortunately, this thread can use a lot of CPU depending on the
|
||||
interface's performance. This can waste a lot of CPU and cause
|
||||
various issues with detecting idle CPU and using extra power. To
|
||||
avoid this, the kipmid_max_busy_us parameter sets the maximum amount of time, in
|
||||
microseconds, that kipmid will spin before sleeping for a tick. This
|
||||
value sets a balance between performance and CPU waste and needs to be
|
||||
tuned to your needs. Maybe, someday, auto-tuning will be added, but
|
||||
that's not a simple thing and even the auto-tuning would need to be
|
||||
tuned to the user's desired performance.
|
||||
|
||||
The driver supports a hot add and remove of interfaces. This way,
|
||||
interfaces can be added or removed after the kernel is up and running.
|
||||
This is done using /sys/module/ipmi_si/parameters/hotmod, which is a
|
||||
|
@ -1,3 +1,3 @@
|
||||
obj-m := DocBook/ accounting/ auxdisplay/ connector/ \
|
||||
filesystems/configfs/ ia64/ networking/ \
|
||||
pcmcia/ spi/ video4linux/ vm/ watchdog/src/
|
||||
filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \
|
||||
pcmcia/ spi/ timers/ video4linux/ vm/ watchdog/src/
|
||||
|
@ -6,16 +6,22 @@ checklist.txt
|
||||
- Review Checklist for RCU Patches
|
||||
listRCU.txt
|
||||
- Using RCU to Protect Read-Mostly Linked Lists
|
||||
lockdep.txt
|
||||
- RCU and lockdep checking
|
||||
NMI-RCU.txt
|
||||
- Using RCU to Protect Dynamic NMI Handlers
|
||||
rcubarrier.txt
|
||||
- RCU and Unloadable Modules
|
||||
rculist_nulls.txt
|
||||
- RCU list primitives for use with SLAB_DESTROY_BY_RCU
|
||||
rcuref.txt
|
||||
- Reference-count design for elements of lists/arrays protected by RCU
|
||||
rcu.txt
|
||||
- RCU Concepts
|
||||
rcubarrier.txt
|
||||
- Unloading modules that use RCU callbacks
|
||||
RTFP.txt
|
||||
- List of RCU papers (bibliography) going back to 1980.
|
||||
stallwarn.txt
|
||||
- RCU CPU stall warnings (CONFIG_RCU_CPU_STALL_DETECTOR)
|
||||
torture.txt
|
||||
- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
|
||||
trace.txt
|
||||
|
@ -34,7 +34,7 @@ NMI handler.
|
||||
cpu = smp_processor_id();
|
||||
++nmi_count(cpu);
|
||||
|
||||
if (!rcu_dereference(nmi_callback)(regs, cpu))
|
||||
if (!rcu_dereference_sched(nmi_callback)(regs, cpu))
|
||||
default_do_nmi(regs);
|
||||
|
||||
nmi_exit();
|
||||
@ -47,12 +47,13 @@ function pointer. If this handler returns zero, do_nmi() invokes the
|
||||
default_do_nmi() function to handle a machine-specific NMI. Finally,
|
||||
preemption is restored.
|
||||
|
||||
Strictly speaking, rcu_dereference() is not needed, since this code runs
|
||||
only on i386, which does not need rcu_dereference() anyway. However,
|
||||
it is a good documentation aid, particularly for anyone attempting to
|
||||
do something similar on Alpha.
|
||||
In theory, rcu_dereference_sched() is not needed, since this code runs
|
||||
only on i386, which in theory does not need rcu_dereference_sched()
|
||||
anyway. However, in practice it is a good documentation aid, particularly
|
||||
for anyone attempting to do something similar on Alpha or on systems
|
||||
with aggressive optimizing compilers.
|
||||
|
||||
Quick Quiz: Why might the rcu_dereference() be necessary on Alpha,
|
||||
Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha,
|
||||
given that the code referenced by the pointer is read-only?
|
||||
|
||||
|
||||
@ -99,17 +100,21 @@ invoke irq_enter() and irq_exit() on NMI entry and exit, respectively.
|
||||
|
||||
Answer to Quick Quiz
|
||||
|
||||
Why might the rcu_dereference() be necessary on Alpha, given
|
||||
Why might the rcu_dereference_sched() be necessary on Alpha, given
|
||||
that the code referenced by the pointer is read-only?
|
||||
|
||||
Answer: The caller to set_nmi_callback() might well have
|
||||
initialized some data that is to be used by the
|
||||
new NMI handler. In this case, the rcu_dereference()
|
||||
would be needed, because otherwise a CPU that received
|
||||
an NMI just after the new handler was set might see
|
||||
the pointer to the new NMI handler, but the old
|
||||
pre-initialized version of the handler's data.
|
||||
initialized some data that is to be used by the new NMI
|
||||
handler. In this case, the rcu_dereference_sched() would
|
||||
be needed, because otherwise a CPU that received an NMI
|
||||
just after the new handler was set might see the pointer
|
||||
to the new NMI handler, but the old pre-initialized
|
||||
version of the handler's data.
|
||||
|
||||
More important, the rcu_dereference() makes it clear
|
||||
to someone reading the code that the pointer is being
|
||||
protected by RCU.
|
||||
This same sad story can happen on other CPUs when using
|
||||
a compiler with aggressive pointer-value speculation
|
||||
optimizations.
|
||||
|
||||
More important, the rcu_dereference_sched() makes it
|
||||
clear to someone reading the code that the pointer is
|
||||
being protected by RCU-sched.
|
||||
|
@ -25,10 +25,10 @@ to be referencing the data structure. However, this mechanism was not
|
||||
optimized for modern computer systems, which is not surprising given
|
||||
that these overheads were not so expensive in the mid-80s. Nonetheless,
|
||||
passive serialization appears to be the first deferred-destruction
|
||||
mechanism to be used in production. Furthermore, the relevant patent has
|
||||
lapsed, so this approach may be used in non-GPL software, if desired.
|
||||
(In contrast, use of RCU is permitted only in software licensed under
|
||||
GPL. Sorry!!!)
|
||||
mechanism to be used in production. Furthermore, the relevant patent
|
||||
has lapsed, so this approach may be used in non-GPL software, if desired.
|
||||
(In contrast, implementation of RCU is permitted only in software licensed
|
||||
under either GPL or LGPL. Sorry!!!)
|
||||
|
||||
In 1990, Pugh [Pugh90] noted that explicitly tracking which threads
|
||||
were reading a given data structure permitted deferred free to operate
|
||||
@ -150,6 +150,18 @@ preemptible RCU [PaulEMcKenney2007PreemptibleRCU], and the three-part
|
||||
LWN "What is RCU?" series [PaulEMcKenney2007WhatIsRCUFundamentally,
|
||||
PaulEMcKenney2008WhatIsRCUUsage, and PaulEMcKenney2008WhatIsRCUAPI].
|
||||
|
||||
2008 saw a journal paper on real-time RCU [DinakarGuniguntala2008IBMSysJ],
|
||||
a history of how Linux changed RCU more than RCU changed Linux
|
||||
[PaulEMcKenney2008RCUOSR], and a design overview of hierarchical RCU
|
||||
[PaulEMcKenney2008HierarchicalRCU].
|
||||
|
||||
2009 introduced user-level RCU algorithms [PaulEMcKenney2009MaliciousURCU],
|
||||
which Mathieu Desnoyers is now maintaining [MathieuDesnoyers2009URCU]
|
||||
[MathieuDesnoyersPhD]. TINY_RCU [PaulEMcKenney2009BloatWatchRCU] made
|
||||
its appearance, as did expedited RCU [PaulEMcKenney2009expeditedRCU].
|
||||
The problem of resizeable RCU-protected hash tables may now be on a path
|
||||
to a solution [JoshTriplett2009RPHash].
|
||||
|
||||
Bibtex Entries
|
||||
|
||||
@article{Kung80
|
||||
@ -730,6 +742,11 @@ Revised:
|
||||
"
|
||||
}
|
||||
|
||||
#
|
||||
# "What is RCU?" LWN series.
|
||||
#
|
||||
########################################################################
|
||||
|
||||
@article{DinakarGuniguntala2008IBMSysJ
|
||||
,author="D. Guniguntala and P. E. McKenney and J. Triplett and J. Walpole"
|
||||
,title="The read-copy-update mechanism for supporting real-time applications on shared-memory multiprocessor systems with {Linux}"
|
||||
@ -820,3 +837,39 @@ Revised:
|
||||
Uniprocessor assumptions allow simplified RCU implementation.
|
||||
"
|
||||
}
|
||||
|
||||
@unpublished{PaulEMcKenney2009expeditedRCU
|
||||
,Author="Paul E. McKenney"
|
||||
,Title="[{PATCH} -tip 0/3] expedited 'big hammer' {RCU} grace periods"
|
||||
,month="June"
|
||||
,day="25"
|
||||
,year="2009"
|
||||
,note="Available:
|
||||
\url{http://lkml.org/lkml/2009/6/25/306}
|
||||
[Viewed August 16, 2009]"
|
||||
,annotation="
|
||||
First posting of expedited RCU to be accepted into -tip.
|
||||
"
|
||||
}
|
||||
|
||||
@unpublished{JoshTriplett2009RPHash
|
||||
,Author="Josh Triplett"
|
||||
,Title="Scalable concurrent hash tables via relativistic programming"
|
||||
,month="September"
|
||||
,year="2009"
|
||||
,note="Linux Plumbers Conference presentation"
|
||||
,annotation="
|
||||
RP fun with hash tables.
|
||||
"
|
||||
}
|
||||
|
||||
@phdthesis{MathieuDesnoyersPhD
|
||||
, title = "Low-Impact Operating System Tracing"
|
||||
, author = "Mathieu Desnoyers"
|
||||
, school = "Ecole Polytechnique de Montr\'{e}al"
|
||||
, month = "December"
|
||||
, year = 2009
|
||||
,note="Available:
|
||||
\url{http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf}
|
||||
[Viewed December 9, 2009]"
|
||||
}
|
||||
|
@ -8,13 +8,12 @@ would cause. This list is based on experiences reviewing such patches
|
||||
over a rather long period of time, but improvements are always welcome!
|
||||
|
||||
0. Is RCU being applied to a read-mostly situation? If the data
|
||||
structure is updated more than about 10% of the time, then
|
||||
you should strongly consider some other approach, unless
|
||||
detailed performance measurements show that RCU is nonetheless
|
||||
the right tool for the job. Yes, you might think of RCU
|
||||
as simply cutting overhead off of the readers and imposing it
|
||||
on the writers. That is exactly why normal uses of RCU will
|
||||
do much more reading than updating.
|
||||
structure is updated more than about 10% of the time, then you
|
||||
should strongly consider some other approach, unless detailed
|
||||
performance measurements show that RCU is nonetheless the right
|
||||
tool for the job. Yes, RCU does reduce read-side overhead by
|
||||
increasing write-side overhead, which is exactly why normal uses
|
||||
of RCU will do much more reading than updating.
|
||||
|
||||
Another exception is where performance is not an issue, and RCU
|
||||
provides a simpler implementation. An example of this situation
|
||||
@ -35,13 +34,13 @@ over a rather long period of time, but improvements are always welcome!
|
||||
|
||||
If you choose #b, be prepared to describe how you have handled
|
||||
memory barriers on weakly ordered machines (pretty much all of
|
||||
them -- even x86 allows reads to be reordered), and be prepared
|
||||
to explain why this added complexity is worthwhile. If you
|
||||
choose #c, be prepared to explain how this single task does not
|
||||
become a major bottleneck on big multiprocessor machines (for
|
||||
example, if the task is updating information relating to itself
|
||||
that other tasks can read, there by definition can be no
|
||||
bottleneck).
|
||||
them -- even x86 allows later loads to be reordered to precede
|
||||
earlier stores), and be prepared to explain why this added
|
||||
complexity is worthwhile. If you choose #c, be prepared to
|
||||
explain how this single task does not become a major bottleneck on
|
||||
big multiprocessor machines (for example, if the task is updating
|
||||
information relating to itself that other tasks can read, there
|
||||
by definition can be no bottleneck).
|
||||
|
||||
2. Do the RCU read-side critical sections make proper use of
|
||||
rcu_read_lock() and friends? These primitives are needed
|
||||
@ -51,8 +50,10 @@ over a rather long period of time, but improvements are always welcome!
|
||||
actuarial risk of your kernel.
|
||||
|
||||
As a rough rule of thumb, any dereference of an RCU-protected
|
||||
pointer must be covered by rcu_read_lock() or rcu_read_lock_bh()
|
||||
or by the appropriate update-side lock.
|
||||
pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
|
||||
rcu_read_lock_sched(), or by the appropriate update-side lock.
|
||||
Disabling of preemption can serve as rcu_read_lock_sched(), but
|
||||
is less readable.
|
||||
|
||||
3. Does the update code tolerate concurrent accesses?
|
||||
|
||||
@ -62,25 +63,27 @@ over a rather long period of time, but improvements are always welcome!
|
||||
of ways to handle this concurrency, depending on the situation:
|
||||
|
||||
a. Use the RCU variants of the list and hlist update
|
||||
primitives to add, remove, and replace elements on an
|
||||
RCU-protected list. Alternatively, use the RCU-protected
|
||||
trees that have been added to the Linux kernel.
|
||||
primitives to add, remove, and replace elements on
|
||||
an RCU-protected list. Alternatively, use the other
|
||||
RCU-protected data structures that have been added to
|
||||
the Linux kernel.
|
||||
|
||||
This is almost always the best approach.
|
||||
|
||||
b. Proceed as in (a) above, but also maintain per-element
|
||||
locks (that are acquired by both readers and writers)
|
||||
that guard per-element state. Of course, fields that
|
||||
the readers refrain from accessing can be guarded by the
|
||||
update-side lock.
|
||||
the readers refrain from accessing can be guarded by
|
||||
some other lock acquired only by updaters, if desired.
|
||||
|
||||
This works quite well, also.
|
||||
|
||||
c. Make updates appear atomic to readers. For example,
|
||||
pointer updates to properly aligned fields will appear
|
||||
atomic, as will individual atomic primitives. Operations
|
||||
performed under a lock and sequences of multiple atomic
|
||||
primitives will -not- appear to be atomic.
|
||||
pointer updates to properly aligned fields will
|
||||
appear atomic, as will individual atomic primitives.
|
||||
Sequences of operations performed under a lock will -not-
|
||||
appear to be atomic to RCU readers, nor will sequences
|
||||
of multiple atomic primitives.
|
||||
|
||||
This can work, but is starting to get a bit tricky.
|
||||
|
||||
@ -98,9 +101,9 @@ over a rather long period of time, but improvements are always welcome!
|
||||
a new structure containing updated values.
|
||||
|
||||
4. Weakly ordered CPUs pose special challenges. Almost all CPUs
|
||||
are weakly ordered -- even i386 CPUs allow reads to be reordered.
|
||||
RCU code must take all of the following measures to prevent
|
||||
memory-corruption problems:
|
||||
are weakly ordered -- even x86 CPUs allow later loads to be
|
||||
reordered to precede earlier stores. RCU code must take all of
|
||||
the following measures to prevent memory-corruption problems:
|
||||
|
||||
a. Readers must maintain proper ordering of their memory
|
||||
accesses. The rcu_dereference() primitive ensures that
|
||||
@ -113,14 +116,25 @@ over a rather long period of time, but improvements are always welcome!
|
||||
The rcu_dereference() primitive is also an excellent
|
||||
documentation aid, letting the person reading the code
|
||||
know exactly which pointers are protected by RCU.
|
||||
Please note that compilers can also reorder code, and
|
||||
they are becoming increasingly aggressive about doing
|
||||
just that. The rcu_dereference() primitive therefore
|
||||
also prevents destructive compiler optimizations.
|
||||
|
||||
The rcu_dereference() primitive is used by the various
|
||||
"_rcu()" list-traversal primitives, such as the
|
||||
list_for_each_entry_rcu(). Note that it is perfectly
|
||||
legal (if redundant) for update-side code to use
|
||||
rcu_dereference() and the "_rcu()" list-traversal
|
||||
primitives. This is particularly useful in code
|
||||
that is common to readers and updaters.
|
||||
The rcu_dereference() primitive is used by the
|
||||
various "_rcu()" list-traversal primitives, such
|
||||
as the list_for_each_entry_rcu(). Note that it is
|
||||
perfectly legal (if redundant) for update-side code to
|
||||
use rcu_dereference() and the "_rcu()" list-traversal
|
||||
primitives. This is particularly useful in code that
|
||||
is common to readers and updaters. However, lockdep
|
||||
will complain if you access rcu_dereference() outside
|
||||
of an RCU read-side critical section. See lockdep.txt
|
||||
to learn what to do about this.
|
||||
|
||||
Of course, neither rcu_dereference() nor the "_rcu()"
|
||||
list-traversal primitives can substitute for a good
|
||||
concurrency design coordinating among multiple updaters.
|
||||
|
||||
b. If the list macros are being used, the list_add_tail_rcu()
|
||||
and list_add_rcu() primitives must be used in order
|
||||
@ -135,11 +149,14 @@ over a rather long period of time, but improvements are always welcome!
|
||||
readers. Similarly, if the hlist macros are being used,
|
||||
the hlist_del_rcu() primitive is required.
|
||||
|
||||
The list_replace_rcu() primitive may be used to
|
||||
replace an old structure with a new one in an
|
||||
RCU-protected list.
|
||||
The list_replace_rcu() and hlist_replace_rcu() primitives
|
||||
may be used to replace an old structure with a new one
|
||||
in their respective types of RCU-protected lists.
|
||||
|
||||
d. Updates must ensure that initialization of a given
|
||||
d. Rules similar to (4b) and (4c) apply to the "hlist_nulls"
|
||||
type of RCU-protected linked lists.
|
||||
|
||||
e. Updates must ensure that initialization of a given
|
||||
structure happens before pointers to that structure are
|
||||
publicized. Use the rcu_assign_pointer() primitive
|
||||
when publicizing a pointer to a structure that can
|
||||
@ -151,16 +168,31 @@ over a rather long period of time, but improvements are always welcome!
|
||||
it cannot block.
|
||||
|
||||
6. Since synchronize_rcu() can block, it cannot be called from
|
||||
any sort of irq context. Ditto for synchronize_sched() and
|
||||
synchronize_srcu().
|
||||
any sort of irq context. The same rule applies for
|
||||
synchronize_rcu_bh(), synchronize_sched(), synchronize_srcu(),
|
||||
synchronize_rcu_expedited(), synchronize_rcu_bh_expedited(),
|
||||
synchronize_sched_expedited(), and synchronize_srcu_expedited().
|
||||
|
||||
7. If the updater uses call_rcu(), then the corresponding readers
|
||||
must use rcu_read_lock() and rcu_read_unlock(). If the updater
|
||||
uses call_rcu_bh(), then the corresponding readers must use
|
||||
rcu_read_lock_bh() and rcu_read_unlock_bh(). If the updater
|
||||
uses call_rcu_sched(), then the corresponding readers must
|
||||
disable preemption. Mixing things up will result in confusion
|
||||
and broken kernels.
|
||||
The expedited forms of these primitives have the same semantics
|
||||
as the non-expedited forms, but expediting is both expensive
|
||||
and unfriendly to real-time workloads. Use of the expedited
|
||||
primitives should be restricted to rare configuration-change
|
||||
operations that would not normally be undertaken while a real-time
|
||||
workload is running.
|
||||
|
||||
7. If the updater uses call_rcu() or synchronize_rcu(), then the
|
||||
corresponding readers must use rcu_read_lock() and
|
||||
rcu_read_unlock(). If the updater uses call_rcu_bh() or
|
||||
synchronize_rcu_bh(), then the corresponding readers must
|
||||
use rcu_read_lock_bh() and rcu_read_unlock_bh(). If the
|
||||
updater uses call_rcu_sched() or synchronize_sched(), then
|
||||
the corresponding readers must disable preemption, possibly
|
||||
by calling rcu_read_lock_sched() and rcu_read_unlock_sched().
|
||||
If the updater uses synchronize_srcu(), then the corresponding
|
||||
readers must use srcu_read_lock() and srcu_read_unlock(),
|
||||
and with the same srcu_struct. The rules for the expedited
|
||||
primitives are the same as for their non-expedited counterparts.
|
||||
Mixing things up will result in confusion and broken kernels.
|
||||
|
||||
One exception to this rule: rcu_read_lock() and rcu_read_unlock()
|
||||
may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
|
||||
@ -212,6 +244,8 @@ over a rather long period of time, but improvements are always welcome!
|
||||
e. Periodically invoke synchronize_rcu(), permitting a limited
|
||||
number of updates per grace period.
|
||||
|
||||
The same cautions apply to call_rcu_bh() and call_rcu_sched().
|
||||
|
||||
9. All RCU list-traversal primitives, which include
|
||||
rcu_dereference(), list_for_each_entry_rcu(),
|
||||
list_for_each_continue_rcu(), and list_for_each_safe_rcu(),
|
||||
@ -219,17 +253,21 @@ over a rather long period of time, but improvements are always welcome!
|
||||
must be protected by appropriate update-side locks. RCU
|
||||
read-side critical sections are delimited by rcu_read_lock()
|
||||
and rcu_read_unlock(), or by similar primitives such as
|
||||
rcu_read_lock_bh() and rcu_read_unlock_bh().
|
||||
rcu_read_lock_bh() and rcu_read_unlock_bh(), in which case
|
||||
the matching rcu_dereference() primitive must be used in order
|
||||
to keep lockdep happy, in this case, rcu_dereference_bh().
|
||||
|
||||
The reason that it is permissible to use RCU list-traversal
|
||||
primitives when the update-side lock is held is that doing so
|
||||
can be quite helpful in reducing code bloat when common code is
|
||||
shared between readers and updaters.
|
||||
shared between readers and updaters. Additional primitives
|
||||
are provided for this case, as discussed in lockdep.txt.
|
||||
|
||||
10. Conversely, if you are in an RCU read-side critical section,
|
||||
and you don't hold the appropriate update-side lock, you -must-
|
||||
use the "_rcu()" variants of the list macros. Failing to do so
|
||||
will break Alpha and confuse people reading your code.
|
||||
will break Alpha, cause aggressive compilers to generate bad code,
|
||||
and confuse people trying to read your code.
|
||||
|
||||
11. Note that synchronize_rcu() -only- guarantees to wait until
|
||||
all currently executing rcu_read_lock()-protected RCU read-side
|
||||
@ -239,15 +277,21 @@ over a rather long period of time, but improvements are always welcome!
|
||||
rcu_read_lock()-protected read-side critical sections, do -not-
|
||||
use synchronize_rcu().
|
||||
|
||||
If you want to wait for some of these other things, you might
|
||||
instead need to use synchronize_irq() or synchronize_sched().
|
||||
Similarly, disabling preemption is not an acceptable substitute
|
||||
for rcu_read_lock(). Code that attempts to use preemption
|
||||
disabling where it should be using rcu_read_lock() will break
|
||||
in real-time kernel builds.
|
||||
|
||||
If you want to wait for interrupt handlers, NMI handlers, and
|
||||
code under the influence of preempt_disable(), you instead
|
||||
need to use synchronize_irq() or synchronize_sched().
|
||||
|
||||
12. Any lock acquired by an RCU callback must be acquired elsewhere
|
||||
with softirq disabled, e.g., via spin_lock_irqsave(),
|
||||
spin_lock_bh(), etc. Failing to disable irq on a given
|
||||
acquisition of that lock will result in deadlock as soon as the
|
||||
RCU callback happens to interrupt that acquisition's critical
|
||||
section.
|
||||
acquisition of that lock will result in deadlock as soon as
|
||||
the RCU softirq handler happens to run your RCU callback while
|
||||
interrupting that acquisition's critical section.
|
||||
|
||||
13. RCU callbacks can be and are executed in parallel. In many cases,
|
||||
the callback code is simply a wrapper around kfree(), so that this
|
||||
@ -265,29 +309,30 @@ over a rather long period of time, but improvements are always welcome!
|
||||
not the case, a self-spawning RCU callback would prevent the
|
||||
victim CPU from ever going offline.)
|
||||
|
||||
14. SRCU (srcu_read_lock(), srcu_read_unlock(), and synchronize_srcu())
|
||||
may only be invoked from process context. Unlike other forms of
|
||||
RCU, it -is- permissible to block in an SRCU read-side critical
|
||||
section (demarked by srcu_read_lock() and srcu_read_unlock()),
|
||||
hence the "SRCU": "sleepable RCU". Please note that if you
|
||||
don't need to sleep in read-side critical sections, you should
|
||||
be using RCU rather than SRCU, because RCU is almost always
|
||||
faster and easier to use than is SRCU.
|
||||
14. SRCU (srcu_read_lock(), srcu_read_unlock(), srcu_dereference(),
|
||||
synchronize_srcu(), and synchronize_srcu_expedited()) may only
|
||||
be invoked from process context. Unlike other forms of RCU, it
|
||||
-is- permissible to block in an SRCU read-side critical section
|
||||
(demarked by srcu_read_lock() and srcu_read_unlock()), hence the
|
||||
"SRCU": "sleepable RCU". Please note that if you don't need
|
||||
to sleep in read-side critical sections, you should be using
|
||||
RCU rather than SRCU, because RCU is almost always faster and
|
||||
easier to use than is SRCU.
|
||||
|
||||
Also unlike other forms of RCU, explicit initialization
|
||||
and cleanup is required via init_srcu_struct() and
|
||||
cleanup_srcu_struct(). These are passed a "struct srcu_struct"
|
||||
that defines the scope of a given SRCU domain. Once initialized,
|
||||
the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock()
|
||||
and synchronize_srcu(). A given synchronize_srcu() waits only
|
||||
for SRCU read-side critical sections governed by srcu_read_lock()
|
||||
and srcu_read_unlock() calls that have been passd the same
|
||||
srcu_struct. This property is what makes sleeping read-side
|
||||
critical sections tolerable -- a given subsystem delays only
|
||||
its own updates, not those of other subsystems using SRCU.
|
||||
Therefore, SRCU is less prone to OOM the system than RCU would
|
||||
be if RCU's read-side critical sections were permitted to
|
||||
sleep.
|
||||
synchronize_srcu(), and synchronize_srcu_expedited(). A given
|
||||
synchronize_srcu() waits only for SRCU read-side critical
|
||||
sections governed by srcu_read_lock() and srcu_read_unlock()
|
||||
calls that have been passed the same srcu_struct. This property
|
||||
is what makes sleeping read-side critical sections tolerable --
|
||||
a given subsystem delays only its own updates, not those of other
|
||||
subsystems using SRCU. Therefore, SRCU is less prone to OOM the
|
||||
system than RCU would be if RCU's read-side critical sections
|
||||
were permitted to sleep.
|
||||
|
||||
The ability to sleep in read-side critical sections does not
|
||||
come for free. First, corresponding srcu_read_lock() and
|
||||
@ -300,8 +345,8 @@ over a rather long period of time, but improvements are always welcome!
|
||||
requiring SRCU's read-side deadlock immunity or low read-side
|
||||
realtime latency.
|
||||
|
||||
Note that, rcu_assign_pointer() and rcu_dereference() relate to
|
||||
SRCU just as they do to other forms of RCU.
|
||||
Note that rcu_assign_pointer() relates to SRCU just as it does
|
||||
to other forms of RCU.
|
||||
|
||||
15. The whole point of call_rcu(), synchronize_rcu(), and friends
|
||||
is to wait until all pre-existing readers have finished before
|
||||
@ -311,12 +356,12 @@ over a rather long period of time, but improvements are always welcome!
|
||||
destructive operation, and -only- -then- invoke call_rcu(),
|
||||
synchronize_rcu(), or friends.
|
||||
|
||||
Because these primitives only wait for pre-existing readers,
|
||||
it is the caller's responsibility to guarantee safety to
|
||||
any subsequent readers.
|
||||
Because these primitives only wait for pre-existing readers, it
|
||||
is the caller's responsibility to guarantee that any subsequent
|
||||
readers will execute safely.
|
||||
|
||||
16. The various RCU read-side primitives do -not- contain memory
|
||||
barriers. The CPU (and in some cases, the compiler) is free
|
||||
to reorder code into and out of RCU read-side critical sections.
|
||||
It is the responsibility of the RCU update-side primitives to
|
||||
deal with this.
|
||||
16. The various RCU read-side primitives do -not- necessarily contain
|
||||
memory barriers. You should therefore plan for the CPU
|
||||
and the compiler to freely reorder code into and out of RCU
|
||||
read-side critical sections. It is the responsibility of the
|
||||
RCU update-side primitives to deal with this.
|
||||
|
91
Documentation/RCU/lockdep.txt
Normal file
91
Documentation/RCU/lockdep.txt
Normal file
@ -0,0 +1,91 @@
|
||||
RCU and lockdep checking
|
||||
|
||||
All flavors of RCU have lockdep checking available, so that lockdep is
|
||||
aware of when each task enters and leaves any flavor of RCU read-side
|
||||
critical section. Each flavor of RCU is tracked separately (but note
|
||||
that this is not the case in 2.6.32 and earlier). This allows lockdep's
|
||||
tracking to include RCU state, which can sometimes help when debugging
|
||||
deadlocks and the like.
|
||||
|
||||
In addition, RCU provides the following primitives that check lockdep's
|
||||
state:
|
||||
|
||||
rcu_read_lock_held() for normal RCU.
|
||||
rcu_read_lock_bh_held() for RCU-bh.
|
||||
rcu_read_lock_sched_held() for RCU-sched.
|
||||
srcu_read_lock_held() for SRCU.
|
||||
|
||||
These functions are conservative, and will therefore return 1 if they
|
||||
aren't certain (for example, if CONFIG_DEBUG_LOCK_ALLOC is not set).
|
||||
This prevents things like WARN_ON(!rcu_read_lock_held()) from giving false
|
||||
positives when lockdep is disabled.
|
||||
|
||||
In addition, a separate kernel config parameter CONFIG_PROVE_RCU enables
|
||||
checking of rcu_dereference() primitives:
|
||||
|
||||
rcu_dereference(p):
|
||||
Check for RCU read-side critical section.
|
||||
rcu_dereference_bh(p):
|
||||
Check for RCU-bh read-side critical section.
|
||||
rcu_dereference_sched(p):
|
||||
Check for RCU-sched read-side critical section.
|
||||
srcu_dereference(p, sp):
|
||||
Check for SRCU read-side critical section.
|
||||
rcu_dereference_check(p, c):
|
||||
Use explicit check expression "c". This is useful in
|
||||
code that is invoked by both readers and updaters.
|
||||
rcu_dereference_raw(p)
|
||||
Don't check. (Use sparingly, if at all.)
|
||||
rcu_dereference_protected(p, c):
|
||||
Use explicit check expression "c", and omit all barriers
|
||||
and compiler constraints. This is useful when the data
|
||||
structure cannot change, for example, in code that is
|
||||
invoked only by updaters.
|
||||
rcu_access_pointer(p):
|
||||
Return the value of the pointer and omit all barriers,
|
||||
but retain the compiler constraints that prevent duplicating
|
||||
or coalescing. This is useful when testing the
|
||||
value of the pointer itself, for example, against NULL.
|
||||
|
||||
The rcu_dereference_check() check expression can be any boolean
|
||||
expression, but would normally include one of the rcu_read_lock_held()
|
||||
family of functions and a lockdep expression. However, any boolean
|
||||
expression can be used. For a moderately ornate example, consider
|
||||
the following:
|
||||
|
||||
file = rcu_dereference_check(fdt->fd[fd],
|
||||
rcu_read_lock_held() ||
|
||||
lockdep_is_held(&files->file_lock) ||
|
||||
atomic_read(&files->count) == 1);
|
||||
|
||||
This expression picks up the pointer "fdt->fd[fd]" in an RCU-safe manner,
|
||||
and, if CONFIG_PROVE_RCU is configured, verifies that this expression
|
||||
is used in:
|
||||
|
||||
1. An RCU read-side critical section, or
|
||||
2. with files->file_lock held, or
|
||||
3. on an unshared files_struct.
|
||||
|
||||
In case (1), the pointer is picked up in an RCU-safe manner for vanilla
|
||||
RCU read-side critical sections, in case (2) the ->file_lock prevents
|
||||
any change from taking place, and finally, in case (3) the current task
|
||||
is the only task accessing the files_struct, again preventing any change
|
||||
from taking place. If the above statement was invoked only from updater
|
||||
code, it could instead be written as follows:
|
||||
|
||||
file = rcu_dereference_protected(fdt->fd[fd],
|
||||
lockdep_is_held(&files->file_lock) ||
|
||||
atomic_read(&files->count) == 1);
|
||||
|
||||
This would verify cases #2 and #3 above, and furthermore lockdep would
|
||||
complain if this was used in an RCU read-side critical section unless one
|
||||
of these two cases held. Because rcu_dereference_protected() omits all
|
||||
barriers and compiler constraints, it generates better code than do the
|
||||
other flavors of rcu_dereference(). On the other hand, it is illegal
|
||||
to use rcu_dereference_protected() if either the RCU-protected pointer
|
||||
or the RCU-protected data that it points to can change concurrently.
|
||||
|
||||
There are currently only "universal" versions of the rcu_assign_pointer()
|
||||
and RCU list-/tree-traversal primitives, which do not (yet) check for
|
||||
being in an RCU read-side critical section. In the future, separate
|
||||
versions of these primitives might be created.
|
@ -75,6 +75,8 @@ o I hear that RCU is patented? What is with that?
|
||||
search for the string "Patent" in RTFP.txt to find them.
|
||||
Of these, one was allowed to lapse by the assignee, and the
|
||||
others have been contributed to the Linux kernel under GPL.
|
||||
There are now also LGPL implementations of user-level RCU
|
||||
available (http://lttng.org/?q=node/18).
|
||||
|
||||
o I hear that RCU needs work in order to support realtime kernels?
|
||||
|
||||
@ -91,48 +93,4 @@ o Where can I find more information on RCU?
|
||||
|
||||
o What are all these files in this directory?
|
||||
|
||||
|
||||
NMI-RCU.txt
|
||||
|
||||
Describes how to use RCU to implement dynamic
|
||||
NMI handlers, which can be revectored on the fly,
|
||||
without rebooting.
|
||||
|
||||
RTFP.txt
|
||||
|
||||
List of RCU-related publications and web sites.
|
||||
|
||||
UP.txt
|
||||
|
||||
Discussion of RCU usage in UP kernels.
|
||||
|
||||
arrayRCU.txt
|
||||
|
||||
Describes how to use RCU to protect arrays, with
|
||||
resizeable arrays whose elements reference other
|
||||
data structures being of the most interest.
|
||||
|
||||
checklist.txt
|
||||
|
||||
Lists things to check for when inspecting code that
|
||||
uses RCU.
|
||||
|
||||
listRCU.txt
|
||||
|
||||
Describes how to use RCU to protect linked lists.
|
||||
This is the simplest and most common use of RCU
|
||||
in the Linux kernel.
|
||||
|
||||
rcu.txt
|
||||
|
||||
You are reading it!
|
||||
|
||||
rcuref.txt
|
||||
|
||||
Describes how to combine use of reference counts
|
||||
with RCU.
|
||||
|
||||
whatisRCU.txt
|
||||
|
||||
Overview of how the RCU implementation works. Along
|
||||
the way, presents a conceptual view of RCU.
|
||||
See 00-INDEX for the list.
|
||||
|
58
Documentation/RCU/stallwarn.txt
Normal file
58
Documentation/RCU/stallwarn.txt
Normal file
@ -0,0 +1,58 @@
|
||||
Using RCU's CPU Stall Detector
|
||||
|
||||
The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables
|
||||
RCU's CPU stall detector, which detects conditions that unduly delay
|
||||
RCU grace periods. The stall detector's idea of what constitutes
|
||||
"unduly delayed" is controlled by a pair of C preprocessor macros:
|
||||
|
||||
RCU_SECONDS_TILL_STALL_CHECK
|
||||
|
||||
This macro defines the period of time that RCU will wait from
|
||||
the beginning of a grace period until it issues an RCU CPU
|
||||
stall warning. It is normally ten seconds.
|
||||
|
||||
RCU_SECONDS_TILL_STALL_RECHECK
|
||||
|
||||
This macro defines the period of time that RCU will wait after
|
||||
issuing a stall warning until it issues another stall warning.
|
||||
It is normally set to thirty seconds.
|
||||
|
||||
RCU_STALL_RAT_DELAY
|
||||
|
||||
The CPU stall detector tries to make the offending CPU rat on itself,
|
||||
as this often gives better-quality stack traces. However, if
|
||||
the offending CPU does not detect its own stall in the number
|
||||
of jiffies specified by RCU_STALL_RAT_DELAY, then other CPUs will
|
||||
complain. This is normally set to two jiffies.
|
||||
|
||||
The following problems can result in an RCU CPU stall warning:
|
||||
|
||||
o A CPU looping in an RCU read-side critical section.
|
||||
|
||||
o A CPU looping with interrupts disabled.
|
||||
|
||||
o A CPU looping with preemption disabled.
|
||||
|
||||
o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
|
||||
without invoking schedule().
|
||||
|
||||
o A bug in the RCU implementation.
|
||||
|
||||
o A hardware failure. This is quite unlikely, but has occurred
|
||||
at least once in a former life. A CPU failed in a running system,
|
||||
becoming unresponsive, but not causing an immediate crash.
|
||||
This resulted in a series of RCU CPU stall warnings, eventually
|
||||
leading to the realization that the CPU had failed.
|
||||
|
||||
The RCU, RCU-sched, and RCU-bh implementations have CPU stall warnings.
|
||||
SRCU does not do so directly, but its calls to synchronize_sched() will
|
||||
result in RCU-sched detecting any CPU stalls that might be occurring.
|
||||
|
||||
To diagnose the cause of the stall, inspect the stack traces. The offending
|
||||
function will usually be near the top of the stack. If you have a series
|
||||
of stall warnings from a single extended stall, comparing the stack traces
|
||||
can often help determine where the stall is occurring, which will usually
|
||||
be in the function nearest the top of the stack that stays the same from
|
||||
trace to trace.
|
||||
|
||||
RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE.
|
@ -30,6 +30,18 @@ MODULE PARAMETERS
|
||||
|
||||
This module has the following parameters:
|
||||
|
||||
fqs_duration Duration (in microseconds) of artificially induced bursts
|
||||
of force_quiescent_state() invocations. In RCU
|
||||
implementations having force_quiescent_state(), these
|
||||
bursts help force races between forcing a given grace
|
||||
period and that grace period ending on its own.
|
||||
|
||||
fqs_holdoff Holdoff time (in microseconds) between consecutive calls
|
||||
to force_quiescent_state() within a burst.
|
||||
|
||||
fqs_stutter Wait time (in seconds) between consecutive bursts
|
||||
of calls to force_quiescent_state().
|
||||
|
||||
irqreaders Says to invoke RCU readers from irq level. This is currently
|
||||
done via timers. Defaults to "1" for variants of RCU that
|
||||
permit this. (Or, more accurately, variants of RCU that do
|
||||
|
@ -323,14 +323,17 @@ used as follows:
|
||||
Defer Protect
|
||||
|
||||
a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock()
|
||||
call_rcu()
|
||||
call_rcu() rcu_dereference()
|
||||
|
||||
b. call_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh()
|
||||
rcu_dereference_bh()
|
||||
|
||||
c. synchronize_sched() preempt_disable() / preempt_enable()
|
||||
c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched()
|
||||
preempt_disable() / preempt_enable()
|
||||
local_irq_save() / local_irq_restore()
|
||||
hardirq enter / hardirq exit
|
||||
NMI enter / NMI exit
|
||||
rcu_dereference_sched()
|
||||
|
||||
These three mechanisms are used as follows:
|
||||
|
||||
@ -780,9 +783,8 @@ Linux-kernel source code, but it helps to have a full list of the
|
||||
APIs, since there does not appear to be a way to categorize them
|
||||
in docbook. Here is the list, by category.
|
||||
|
||||
RCU pointer/list traversal:
|
||||
RCU list traversal:
|
||||
|
||||
rcu_dereference
|
||||
list_for_each_entry_rcu
|
||||
hlist_for_each_entry_rcu
|
||||
hlist_nulls_for_each_entry_rcu
|
||||
@ -808,7 +810,7 @@ RCU: Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock synchronize_net rcu_barrier
|
||||
rcu_read_unlock synchronize_rcu
|
||||
synchronize_rcu_expedited
|
||||
rcu_dereference synchronize_rcu_expedited
|
||||
call_rcu
|
||||
|
||||
|
||||
@ -816,7 +818,7 @@ bh: Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock_bh call_rcu_bh rcu_barrier_bh
|
||||
rcu_read_unlock_bh synchronize_rcu_bh
|
||||
synchronize_rcu_bh_expedited
|
||||
rcu_dereference_bh synchronize_rcu_bh_expedited
|
||||
|
||||
|
||||
sched: Critical sections Grace period Barrier
|
||||
@ -825,17 +827,25 @@ sched: Critical sections Grace period Barrier
|
||||
rcu_read_unlock_sched call_rcu_sched
|
||||
[preempt_disable] synchronize_sched_expedited
|
||||
[and friends]
|
||||
rcu_dereference_sched
|
||||
|
||||
|
||||
SRCU: Critical sections Grace period Barrier
|
||||
|
||||
srcu_read_lock synchronize_srcu N/A
|
||||
srcu_read_unlock synchronize_srcu_expedited
|
||||
srcu_dereference
|
||||
|
||||
SRCU: Initialization/cleanup
|
||||
init_srcu_struct
|
||||
cleanup_srcu_struct
|
||||
|
||||
All: lockdep-checked RCU-protected pointer access
|
||||
|
||||
rcu_dereference_check
|
||||
rcu_dereference_protected
|
||||
rcu_access_pointer
|
||||
|
||||
See the comment headers in the source code (or the docbook generated
|
||||
from them) for more information.
|
||||
|
||||
|
@ -9,10 +9,14 @@ Documentation/SubmittingPatches and elsewhere regarding submitting Linux
|
||||
kernel patches.
|
||||
|
||||
|
||||
1: Builds cleanly with applicable or modified CONFIG options =y, =m, and
|
||||
1: If you use a facility then #include the file that defines/declares
|
||||
that facility. Don't depend on other header files pulling in ones
|
||||
that you use.
|
||||
|
||||
2: Builds cleanly with applicable or modified CONFIG options =y, =m, and
|
||||
=n. No gcc warnings/errors, no linker warnings/errors.
|
||||
|
||||
2: Passes allnoconfig, allmodconfig
|
||||
2b: Passes allnoconfig, allmodconfig
|
||||
|
||||
3: Builds on multiple CPU architectures by using local cross-compile tools
|
||||
or some other build farm.
|
||||
|
@ -14,8 +14,8 @@ Introduction
|
||||
how the clocks are arranged. The first implementation used a single
|
||||
PLL to feed the ARM, memory and peripherals via a series of dividers
|
||||
and muxes and this is the implementation that is documented here. A
|
||||
newer version where there is a seperate PLL and clock divider for the
|
||||
ARM core is available as a seperate driver.
|
||||
newer version where there is a separate PLL and clock divider for the
|
||||
ARM core is available as a separate driver.
|
||||
|
||||
|
||||
Layout
|
||||
|
86
Documentation/arm/Samsung/Overview.txt
Normal file
86
Documentation/arm/Samsung/Overview.txt
Normal file
@ -0,0 +1,86 @@
|
||||
Samsung ARM Linux Overview
|
||||
==========================
|
||||
|
||||
Introduction
|
||||
------------
|
||||
|
||||
The Samsung range of ARM SoCs spans many similar devices, from the initial
|
||||
ARM9 through to the newest ARM cores. This document shows an overview of
|
||||
the current kernel support, how to use it and where to find the code
|
||||
that supports this.
|
||||
|
||||
The currently supported SoCs are:
|
||||
|
||||
- S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list
|
||||
- S3C64XX: S3C6400 and S3C6410
|
||||
- S5PC6440
|
||||
|
||||
S5PC100 and S5PC110 support is currently being merged
|
||||
|
||||
|
||||
S3C24XX Systems
|
||||
---------------
|
||||
|
||||
There is still documentation in Documentation/arm/Samsung-S3C24XX/ which
|
||||
deals with the architecture and drivers specific to these devices.
|
||||
|
||||
See Documentation/arm/Samsung-S3C24XX/Overview.txt for more information
|
||||
on the implementation details and specific support.
|
||||
|
||||
|
||||
Configuration
|
||||
-------------
|
||||
|
||||
A number of configurations are supplied, as there is no current way of
|
||||
unifying all the SoCs into one kernel.
|
||||
|
||||
s5p6440_defconfig - S5P6440 specific default configuration
|
||||
s5pc100_defconfig - S5PC100 specific default configuration
|
||||
|
||||
|
||||
Layout
|
||||
------
|
||||
|
||||
The directory layout is currently being restructured, and consists of
|
||||
several platform directories and then the machine specific directories
|
||||
of the CPUs being built for.
|
||||
|
||||
plat-samsung provides the base for all the implementations, and is the
|
||||
last in the line of include directories that are processed for the build
|
||||
specific information. It contains the base clock, GPIO and device definitions
|
||||
to get the system running.
|
||||
|
||||
plat-s3c is the s3c24xx/s3c64xx platform directory, although it is currently
|
||||
involved in other builds this will be phased out once the relevant code is
|
||||
moved elsewhere.
|
||||
|
||||
plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs.
|
||||
|
||||
plat-s3c64xx is for the s3c64xx specific bits, see the S3C24XX docs.
|
||||
|
||||
plat-s5p is for s5p specific builds, more to be added.
|
||||
|
||||
|
||||
[ to finish ]
|
||||
|
||||
|
||||
Port Contributors
|
||||
-----------------
|
||||
|
||||
Ben Dooks (BJD)
|
||||
Vincent Sanders
|
||||
Herbert Potzl
|
||||
Arnaud Patard (RTP)
|
||||
Roc Wu
|
||||
Klaus Fetscher
|
||||
Dimitry Andric
|
||||
Shannon Holland
|
||||
Guillaume Gourat (NexVision)
|
||||
Christer Weinigel (wingel) (Acer N30)
|
||||
Lucas Correia Villa Real (S3C2400 port)
|
||||
|
||||
|
||||
Document Author
|
||||
---------------
|
||||
|
||||
Copyright 2009-2010 Ben Dooks <ben-linux@fluff.org>
|
167
Documentation/arm/Samsung/clksrc-change-registers.awk
Executable file
167
Documentation/arm/Samsung/clksrc-change-registers.awk
Executable file
@ -0,0 +1,167 @@
|
||||
#!/usr/bin/awk -f
|
||||
#
|
||||
# Copyright 2010 Ben Dooks <ben-linux@fluff.org>
|
||||
#
|
||||
# Released under GPLv2
|
||||
|
||||
# example usage
|
||||
# ./clksrc-change-registers.awk arch/arm/plat-s5pc1xx/include/plat/regs-clock.h < src > dst
|
||||
|
||||
# Return the text between "= " and the first "," of an initialiser line,
# e.g. "\t.shift = FOO," yields "FOO".
#
# s:      one source line of the form "<field> = <value>,"
# eq_pos, comma_pos: function-local scratch (awk convention: extra
#         parameters that callers do not pass), so nothing leaks into the
#         global namespace.
#
# The value is assumed to start two characters after the "=" (i.e. "= ").
# When the line has no comma the computed length is negative and substr()
# returns the empty string.
function extract_value(s,    eq_pos, comma_pos)
{
	eq_pos = index(s, "=")
	comma_pos = index(s, ",")
	return substr(s, eq_pos + 2, (comma_pos - eq_pos) - 2)
}
|
||||
|
||||
# Strip the first and the last character of b (used to peel the enclosing
# "(" and ")" off a macro body).  Strings shorter than two characters
# come back empty.
function remove_brackets(b)
{
	return substr(substr(b, 2), 1, length(b) - 2)
}
|
||||
|
||||
# Break a "#define NAME (VALUE)" line into its parts.
#
# l:     the whole #define line
# p:     output array; p[0] receives the second whitespace-separated field
#        (the macro name) and p[1] the third field with its first and last
#        character removed (the bracketed value without the brackets)
# words: function-local scratch array (not passed by callers)
function splitdefine(l, p,    words)
{
	# Clear leftovers from any earlier use before splitting.
	split("", words)
	split(l, words)

	p[0] = words[2]
	p[1] = remove_brackets(words[3])
}
|
||||
|
||||
# Map a mask expression onto the width of the field in bits.
#
# f: mask text such as "0x7<<4"
#
# Returns 1, 2, 3 or 4 for a text containing 0x1, 0x3, 0x7 or 0xf
# respectively; the patterns are tried in that order and are unanchored.
# NOTE(review): because the match is unanchored, a wider mask such as
# "0x1f" would be reported as 1 bit -- confirm no such masks are in use.
#
# Any other text is reported on stderr and the script exits with a
# non-zero status.
function find_length(f)
{
	if (0)
		printf "find_length %s\n", f > "/dev/stderr"

	if (f ~ /0x1/)
		return 1
	else if (f ~ /0x3/)
		return 2
	else if (f ~ /0x7/)
		return 3
	else if (f ~ /0xf/)
		return 4

	# Pass the text as an argument so a stray "%" cannot be taken as a
	# format directive.
	printf "unknown length %s\n", f > "/dev/stderr"
	exit 1
}
|
||||
|
||||
# Extract the shift amount from a mask expression.
#
# s:      mask text such as "0x7<<4"
# lt_pos: function-local scratch (not passed by callers)
#
# Returns everything that follows the two characters starting at the
# first "<" (i.e. whatever comes after "<<").  When s contains no "<" the
# problem is reported on stderr and the script exits with a non-zero
# status.
function find_shift(s,    lt_pos)
{
	lt_pos = index(s, "<")
	if (lt_pos <= 0) {
		printf "cannot find shift %s\n", s > "/dev/stderr"
		exit 1
	}

	return substr(s, lt_pos + 2)
}
|
||||
|
||||
|
||||
# Load the register header named by the first argument and record, for
# every "#define ..._MASK" it contains (S5PC100_EPLL_MASK and USB_SIG_MASK
# excepted), the field width in dmask[name,0] and the shift in
# dmask[name,1].  The argument is then removed from ARGV so that awk does
# not also treat the header as an input file.
BEGIN {
	if (ARGC < 2) {
		print "too few arguments" > "/dev/stderr"
		exit 1
	}

	# read the header file and find the mask values that we will need
	# to replace and create an associative array of values

	# The getline is parenthesised so that the "> 0" comparison cannot be
	# mis-parsed as part of the file name expression.
	while ((hdr_status = (getline line < ARGV[1])) > 0) {
		if (line ~ /\#define.*_MASK/ &&
		    !(line ~ /S5PC100_EPLL_MASK/) &&
		    !(line ~ /USB_SIG_MASK/)) {
			splitdefine(line, fields)
			name = fields[0]
			if (0)
				printf "MASK %s\n", line > "/dev/stderr"
			dmask[name,0] = find_length(fields[1])
			dmask[name,1] = find_shift(fields[1])
			if (0)
				printf "=> '%s' LENGTH=%s SHIFT=%s\n", name, dmask[name,0], dmask[name,1] > "/dev/stderr"
		}
	}

	# getline yields a negative value when the file cannot be read.
	if (hdr_status < 0) {
		printf "cannot read %s\n", ARGV[1] > "/dev/stderr"
		exit 1
	}
	close(ARGV[1])

	delete ARGV[1]
}
|
||||
|
||||
# Rewrite one "clksrc_clk ... = {" initialiser.
#
# The opening line is echoed, then the body is consumed line by line with
# getline until the brace nesting returns to zero.  Lines containing
# .shift, .mask, .reg_divider, .reg_source or .divider_shift are swallowed
# and only their values remembered; every other line is echoed unchanged.
# Just before the closing brace of the initialiser is echoed, replacement
# ".reg_div" (only when a .reg_divider was seen) and ".reg_src" entries are
# emitted, using the width/shift table built in BEGIN (dmask).
/clksrc_clk.*=.*{/ {
	shift = ""
	mask = ""
	divshift = ""
	reg_div = ""
	reg_src = ""
	indent = 1

	print $0

	while (indent >= 1) {
		if ((getline line) <= 0) {
			printf "unexpected end of file\n" > "/dev/stderr"
			exit 1
		}

		if (line ~ /\.shift/) {
			shift = extract_value(line)
		} else if (line ~ /\.mask/) {
			mask = extract_value(line)
		} else if (line ~ /\.reg_divider/) {
			reg_div = extract_value(line)
		} else if (line ~ /\.reg_source/) {
			reg_src = extract_value(line)
		} else if (line ~ /\.divider_shift/) {
			divshift = extract_value(line)
		} else if (line ~ /{/) {
			indent++
			print line
		} else if (line ~ /}/) {
			indent--

			if (indent == 0) {
				if (0) {
					printf "shift '%s' ='%s'\n", shift, dmask[shift,0] > "/dev/stderr"
					printf "mask '%s'\n", mask > "/dev/stderr"
					printf "dshft '%s'\n", divshift > "/dev/stderr"
					printf "rdiv '%s'\n", reg_div > "/dev/stderr"
					printf "rsrc '%s'\n", reg_src > "/dev/stderr"
				}

				# The divider mask name is derived from the source
				# mask name by substituting the source register name
				# with the divider register name.
				generated = mask
				sub(reg_src, reg_div, generated)

				if (0) {
					printf "/* rsrc %s */\n", reg_src
					printf "/* rdiv %s */\n", reg_div
					printf "/* shift %s */\n", shift
					printf "/* mask %s */\n", mask
					printf "/* generated %s */\n", generated
				}

				# Values are passed as arguments so that a "%" in
				# the input can never be read as a format directive.
				if (reg_div != "") {
					printf "\t.reg_div = { .reg = %s, .shift = %s, .size = %s, },\n", reg_div, dmask[generated,1], dmask[generated,0]
				}

				printf "\t.reg_src = { .reg = %s, .shift = %s, .size = %s, },\n", reg_src, dmask[mask,1], dmask[mask,0]
			}

			print line
		} else {
			print line
		}

		if (0)
			printf "%s:%s\n", indent, line > "/dev/stderr"
	}
}
|
||||
|
||||
# Echo every record that does not open a clksrc_clk initialiser.
$0 !~ /clksrc_clk.*=.*{/ { print }
|
@ -59,7 +59,11 @@ PAGE_OFFSET high_memory-1 Kernel direct-mapped RAM region.
|
||||
This maps the platform's RAM, and typically
|
||||
maps all platform RAM in a 1:1 relationship.
|
||||
|
||||
TASK_SIZE PAGE_OFFSET-1 Kernel module space
|
||||
PKMAP_BASE PAGE_OFFSET-1 Permanent kernel mappings
|
||||
One way of mapping HIGHMEM pages into kernel
|
||||
space.
|
||||
|
||||
MODULES_VADDR MODULES_END-1 Kernel module space
|
||||
Kernel modules inserted via insmod are
|
||||
placed here using dynamic mappings.
|
||||
|
||||
|
@ -1,7 +1,5 @@
|
||||
00-INDEX
|
||||
- This file
|
||||
as-iosched.txt
|
||||
- Anticipatory IO scheduler
|
||||
barrier.txt
|
||||
- I/O Barriers
|
||||
biodoc.txt
|
||||
|
@ -1,172 +0,0 @@
|
||||
Anticipatory IO scheduler
|
||||
-------------------------
|
||||
Nick Piggin <piggin@cyberone.com.au> 13 Sep 2003
|
||||
|
||||
Attention! Database servers, especially those using "TCQ" disks should
|
||||
investigate performance with the 'deadline' IO scheduler. Any system with high
|
||||
disk performance requirements should do so, in fact.
|
||||
|
||||
If you see unusual performance characteristics of your disk systems, or you
|
||||
see big performance regressions versus the deadline scheduler, please email
|
||||
me. Database users don't bother unless you're willing to test a lot of patches
|
||||
from me ;) it's a known issue.
|
||||
|
||||
Also, users with hardware RAID controllers, doing striping, may find
|
||||
highly variable performance results with using the as-iosched. The
|
||||
as-iosched anticipatory implementation is based on the notion that a disk
|
||||
device has only one physical seeking head. A striped RAID controller
|
||||
actually has a head for each physical device in the logical RAID device.
|
||||
|
||||
However, setting the antic_expire (see tunable parameters below) produces
|
||||
very similar behavior to the deadline IO scheduler.
|
||||
|
||||
Selecting IO schedulers
|
||||
-----------------------
|
||||
Refer to Documentation/block/switching-sched.txt for information on
|
||||
selecting an io scheduler on a per-device basis.
|
||||
|
||||
Anticipatory IO scheduler Policies
|
||||
----------------------------------
|
||||
The as-iosched implementation implements several layers of policies
|
||||
to determine when an IO request is dispatched to the disk controller.
|
||||
Here are the policies outlined, in order of application.
|
||||
|
||||
1. one-way Elevator algorithm.
|
||||
|
||||
The elevator algorithm is similar to that used in deadline scheduler, with
|
||||
the addition that it allows limited backward movement of the elevator
|
||||
(i.e. seeks backwards). A seek backwards can occur when choosing between
|
||||
two IO requests where one is behind the elevator's current position, and
|
||||
the other is in front of the elevator's position. If the seek distance to
|
||||
the request in back of the elevator is less than half the seek distance to
|
||||
the request in front of the elevator, then the request in back can be chosen.
|
||||
Backward seeks are also limited to a maximum of MAXBACK (1024*1024) sectors.
|
||||
This favors forward movement of the elevator, while allowing opportunistic
|
||||
"short" backward seeks.
|
||||
|
||||
2. FIFO expiration times for reads and for writes.
|
||||
|
||||
This is again very similar to the deadline IO scheduler. The expiration
|
||||
times for requests on these lists is tunable using the parameters read_expire
|
||||
and write_expire discussed below. When a read or a write expires in this way,
|
||||
the IO scheduler will interrupt its current elevator sweep or read anticipation
|
||||
to service the expired request.
|
||||
|
||||
3. Read and write request batching
|
||||
|
||||
A batch is a collection of read requests or a collection of write
|
||||
requests. The as scheduler alternates dispatching read and write batches
|
||||
to the driver. In the case of a read batch, the scheduler submits read
|
||||
requests to the driver as long as there are read requests to submit, and
|
||||
the read batch time limit has not been exceeded (read_batch_expire).
|
||||
The read batch time limit begins counting down only when there are
|
||||
competing write requests pending.
|
||||
|
||||
In the case of a write batch, the scheduler submits write requests to
|
||||
the driver as long as there are write requests available, and the
|
||||
write batch time limit has not been exceeded (write_batch_expire).
|
||||
However, the length of write batches will be gradually shortened
|
||||
when read batches frequently exceed their time limit.
|
||||
|
||||
When changing between batch types, the scheduler waits for all requests
|
||||
from the previous batch to complete before scheduling requests for the
|
||||
next batch.
|
||||
|
||||
The read and write fifo expiration times described in policy 2 above
|
||||
are checked only when scheduling IO of a batch for the corresponding
|
||||
(read/write) type. So for example, the read FIFO timeout values are
|
||||
tested only during read batches. Likewise, the write FIFO timeout
|
||||
values are tested only during write batches. For this reason,
|
||||
it is generally not recommended for the read batch time
|
||||
to be longer than the write expiration time, nor for the write batch
|
||||
time to exceed the read expiration time (see tunable parameters below).
|
||||
|
||||
When the IO scheduler changes from a read to a write batch,
|
||||
it begins the elevator from the request that is on the head of the
|
||||
write expiration FIFO. Likewise, when changing from a write batch to
|
||||
a read batch, the scheduler begins the elevator from the first entry
|
||||
on the read expiration FIFO.
|
||||
|
||||
4. Read anticipation.
|
||||
|
||||
Read anticipation occurs only when scheduling a read batch.
|
||||
This implementation of read anticipation allows only one read request
|
||||
to be dispatched to the disk controller at a time. In
|
||||
contrast, many write requests may be dispatched to the disk controller
|
||||
at a time during a write batch. It is this characteristic that can make
|
||||
the anticipatory scheduler perform anomalously with controllers supporting
|
||||
TCQ, or with hardware striped RAID devices. Setting the antic_expire
|
||||
queue parameter (see below) to zero disables this behavior, and the
|
||||
anticipatory scheduler behaves essentially like the deadline scheduler.
|
||||
|
||||
When read anticipation is enabled (antic_expire is not zero), reads
|
||||
are dispatched to the disk controller one at a time.
|
||||
At the end of each read request, the IO scheduler examines its next
|
||||
candidate read request from its sorted read list. If that next request
|
||||
is from the same process as the request that just completed,
|
||||
or if the next request in the queue is "very close" to the
|
||||
just completed request, it is dispatched immediately. Otherwise,
|
||||
statistics (average think time, average seek distance) on the process
|
||||
that submitted the just completed request are examined. If it seems
|
||||
likely that that process will submit another request soon, and that
|
||||
request is likely to be near the just completed request, then the IO
|
||||
scheduler will stop dispatching more read requests for up to (antic_expire)
|
||||
milliseconds, hoping that process will submit a new request near the one
|
||||
that just completed. If such a request is made, then it is dispatched
|
||||
immediately. If the antic_expire wait time expires, then the IO scheduler
|
||||
will dispatch the next read request from the sorted read queue.
|
||||
|
||||
To decide whether an anticipatory wait is worthwhile, the scheduler
|
||||
maintains statistics for each process that can be used to compute
|
||||
mean "think time" (the time between read requests), and mean seek
|
||||
distance for that process. One observation is that these statistics
|
||||
are associated with each process, but those statistics are not associated
|
||||
with a specific IO device. So for example, if a process is doing IO
|
||||
on several file systems on separate devices, the statistics will be
|
||||
a combination of IO behavior from all those devices.
|
||||
|
||||
|
||||
Tuning the anticipatory IO scheduler
|
||||
------------------------------------
|
||||
When using 'as', the anticipatory IO scheduler there are 5 parameters under
|
||||
/sys/block/*/queue/iosched/. All are units of milliseconds.
|
||||
|
||||
The parameters are:
|
||||
* read_expire
|
||||
Controls how long until a read request becomes "expired". It also controls the
|
||||
interval between which expired requests are served, so set to 50, a request
|
||||
might take anywhere < 100ms to be serviced _if_ it is the next on the
|
||||
expired list. Obviously request expiration strategies won't make the disk
|
||||
go faster. The result basically equates to the timeslice a single reader
|
||||
gets in the presence of other IO. 100/((seek time / read_expire) + 1) is
|
||||
very roughly the % streaming read efficiency your disk should get with
|
||||
multiple readers.
|
||||
|
||||
* read_batch_expire
|
||||
Controls how much time a batch of reads is given before pending writes are
|
||||
served. A higher value is more efficient. This might be set below read_expire
|
||||
if writes are to be given higher priority than reads, but reads are to be
|
||||
as efficient as possible when there are no writes. Generally though, it
|
||||
should be some multiple of read_expire.
|
||||
|
||||
* write_expire, and
|
||||
* write_batch_expire are equivalent to the above, for writes.
|
||||
|
||||
* antic_expire
|
||||
Controls the maximum amount of time we can anticipate a good read (one
|
||||
with a short seek distance from the most recently completed request) before
|
||||
giving up. Many other factors may cause anticipation to be stopped early,
|
||||
or some processes will not be "anticipated" at all. Should be a bit higher
|
||||
for big seek time devices though not a linear correspondence - most
|
||||
processes have only a few ms thinktime.
|
||||
|
||||
In addition to the tunables above there is a read-only file named est_time
|
||||
which, when read, will show:
|
||||
|
||||
- The probability of a task exiting without a cooperating task
|
||||
submitting an anticipated IO.
|
||||
|
||||
- The current mean think time.
|
||||
|
||||
- The seek distance used to determine if an incoming IO is better.
|
||||
|
@ -186,7 +186,7 @@ a virtual address mapping (unlike the earlier scheme of virtual address
|
||||
do not have a corresponding kernel virtual address space mapping) and
|
||||
low-memory pages.
|
||||
|
||||
Note: Please refer to Documentation/DMA-mapping.txt for a discussion
|
||||
Note: Please refer to Documentation/PCI/PCI-DMA-mapping.txt for a discussion
|
||||
on PCI high mem DMA aspects and mapping of scatter gather lists, and support
|
||||
for 64 bit PCI.
|
||||
|
||||
@ -1162,8 +1162,8 @@ where a driver received a request ala this before:
|
||||
|
||||
As mentioned, there is no virtual mapping of a bio. For DMA, this is
|
||||
not a problem as the driver probably never will need a virtual mapping.
|
||||
Instead it needs a bus mapping (pci_map_page for a single segment or
|
||||
use blk_rq_map_sg for scatter gather) to be able to ship it to the driver. For
|
||||
Instead it needs a bus mapping (dma_map_page for a single segment or
|
||||
use dma_map_sg for scatter gather) to be able to ship it to the driver. For
|
||||
PIO drivers (or drivers that need to revert to PIO transfer once in a
|
||||
while (IDE for example)), where the CPU is doing the actual data
|
||||
transfer a virtual mapping is needed. If the driver supports highmem I/O,
|
||||
|
@ -25,11 +25,11 @@ size allowed by the hardware.
|
||||
|
||||
nomerges (RW)
|
||||
-------------
|
||||
This enables the user to disable the lookup logic involved with IO merging
|
||||
requests in the block layer. Merging may still occur through a direct
|
||||
1-hit cache, since that comes for (almost) free. The IO scheduler will not
|
||||
waste cycles doing tree/hash lookups for merges if nomerges is 1. Defaults
|
||||
to 0, enabling all merges.
|
||||
This enables the user to disable the lookup logic involved with IO
|
||||
merging requests in the block layer. By default (0) all merges are
|
||||
enabled. When set to 1 only simple one-hit merges will be tried. When
|
||||
set to 2 no merge algorithms will be tried (including one-hit or more
|
||||
complex tree/hash lookups).
|
||||
|
||||
nr_requests (RW)
|
||||
----------------
|
||||
|
@ -88,12 +88,12 @@ changes occur:
|
||||
This is used primarily during fault processing.
|
||||
|
||||
5) void update_mmu_cache(struct vm_area_struct *vma,
|
||||
unsigned long address, pte_t pte)
|
||||
unsigned long address, pte_t *ptep)
|
||||
|
||||
At the end of every page fault, this routine is invoked to
|
||||
tell the architecture specific code that a translation
|
||||
described by "pte" now exists at virtual address "address"
|
||||
for address space "vma->vm_mm", in the software page tables.
|
||||
now exists at virtual address "address" for address space
|
||||
"vma->vm_mm", in the software page tables.
|
||||
|
||||
A port may use this information in any way it so chooses.
|
||||
For example, it could use this event to pre-load TLB
|
||||
@ -377,3 +377,27 @@ maps this page at its virtual address.
|
||||
All the functionality of flush_icache_page can be implemented in
|
||||
flush_dcache_page and update_mmu_cache. In 2.7 the hope is to
|
||||
remove this interface completely.
|
||||
|
||||
The final category of APIs is for I/O to deliberately aliased address
|
||||
ranges inside the kernel. Such aliases are set up by use of the
|
||||
vmap/vmalloc API. Since kernel I/O goes via physical pages, the I/O
|
||||
subsystem assumes that the user mapping and kernel offset mapping are
|
||||
the only aliases. This isn't true for vmap aliases, so anything in
|
||||
the kernel trying to do I/O to vmap areas must manually manage
|
||||
coherency. It must do this by flushing the vmap range before doing
|
||||
I/O and invalidating it after the I/O returns.
|
||||
|
||||
void flush_kernel_vmap_range(void *vaddr, int size)
|
||||
flushes the kernel cache for a given virtual address range in
|
||||
the vmap area. This is to make sure that any data the kernel
|
||||
modified in the vmap range is made visible to the physical
|
||||
page. The design is to make this area safe to perform I/O on.
|
||||
Note that this API does *not* also flush the offset map alias
|
||||
of the area.
|
||||
|
||||
void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates
|
||||
the cache for a given virtual address range in the vmap area
|
||||
which prevents the processor from making the cache stale by
|
||||
speculatively reading data while the I/O was occurring to the
|
||||
physical pages. This is only necessary for data reads into the
|
||||
vmap area.
|
||||
|
@ -159,42 +159,7 @@ two arguments: the CDROM device, and the slot number to which you wish
|
||||
to change. If the slot number is -1, the drive is unloaded.
|
||||
|
||||
|
||||
4. Compilation options
|
||||
----------------------
|
||||
|
||||
There are a few additional options which can be set when compiling the
|
||||
driver. Most people should not need to mess with any of these; they
|
||||
are listed here simply for completeness. A compilation option can be
|
||||
enabled by adding a line of the form `#define <option> 1' to the top
|
||||
of ide-cd.c. All these options are disabled by default.
|
||||
|
||||
VERBOSE_IDE_CD_ERRORS
|
||||
If this is set, ATAPI error codes will be translated into textual
|
||||
descriptions. In addition, a dump is made of the command which
|
||||
provoked the error. This is off by default to save the memory used
|
||||
by the (somewhat long) table of error descriptions.
|
||||
|
||||
STANDARD_ATAPI
|
||||
If this is set, the code needed to deal with certain drives which do
|
||||
not properly implement the ATAPI spec will be disabled. If you know
|
||||
your drive implements ATAPI properly, you can turn this on to get a
|
||||
slightly smaller kernel.
|
||||
|
||||
NO_DOOR_LOCKING
|
||||
If this is set, the driver will never attempt to lock the door of
|
||||
the drive.
|
||||
|
||||
CDROM_NBLOCKS_BUFFER
|
||||
This sets the size of the buffer to be used for a CDROMREADAUDIO
|
||||
ioctl. The default is 8.
|
||||
|
||||
TEST
|
||||
This currently enables an additional ioctl which enables a user-mode
|
||||
program to execute an arbitrary packet command. See the source for
|
||||
details. This should be left off unless you know what you're doing.
|
||||
|
||||
|
||||
5. Common problems
|
||||
4. Common problems
|
||||
------------------
|
||||
|
||||
This section discusses some common problems encountered when trying to
|
||||
@ -371,7 +336,7 @@ f. Data corruption.
|
||||
expense of low system performance.
|
||||
|
||||
|
||||
6. cdchange.c
|
||||
5. cdchange.c
|
||||
-------------
|
||||
|
||||
/*
|
||||
|
110
Documentation/cgroups/cgroup_event_listener.c
Normal file
110
Documentation/cgroups/cgroup_event_listener.c
Normal file
@ -0,0 +1,110 @@
|
||||
/*
|
||||
* cgroup_event_listener.c - Simple listener of cgroup events
|
||||
*
|
||||
* Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <libgen.h>
|
||||
#include <limits.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <sys/eventfd.h>
|
||||
|
||||
#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n"
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int efd = -1;
|
||||
int cfd = -1;
|
||||
int event_control = -1;
|
||||
char event_control_path[PATH_MAX];
|
||||
char line[LINE_MAX];
|
||||
int ret;
|
||||
|
||||
if (argc != 3) {
|
||||
fputs(USAGE_STR, stderr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
cfd = open(argv[1], O_RDONLY);
|
||||
if (cfd == -1) {
|
||||
fprintf(stderr, "Cannot open %s: %s\n", argv[1],
|
||||
strerror(errno));
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
|
||||
dirname(argv[1]));
|
||||
if (ret >= PATH_MAX) {
|
||||
fputs("Path to cgroup.event_control is too long\n", stderr);
|
||||
goto out;
|
||||
}
|
||||
|
||||
event_control = open(event_control_path, O_WRONLY);
|
||||
if (event_control == -1) {
|
||||
fprintf(stderr, "Cannot open %s: %s\n", event_control_path,
|
||||
strerror(errno));
|
||||
goto out;
|
||||
}
|
||||
|
||||
efd = eventfd(0, 0);
|
||||
if (efd == -1) {
|
||||
perror("eventfd() failed");
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
|
||||
if (ret >= LINE_MAX) {
|
||||
fputs("Arguments string is too long\n", stderr);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = write(event_control, line, strlen(line) + 1);
|
||||
if (ret == -1) {
|
||||
perror("Cannot write to cgroup.event_control");
|
||||
goto out;
|
||||
}
|
||||
|
||||
while (1) {
|
||||
uint64_t result;
|
||||
|
||||
ret = read(efd, &result, sizeof(result));
|
||||
if (ret == -1) {
|
||||
if (errno == EINTR)
|
||||
continue;
|
||||
perror("Cannot read from eventfd");
|
||||
break;
|
||||
}
|
||||
assert(ret == sizeof(result));
|
||||
|
||||
ret = access(event_control_path, W_OK);
|
||||
if ((ret == -1) && (errno == ENOENT)) {
|
||||
puts("The cgroup seems to have removed.");
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ret == -1) {
|
||||
perror("cgroup.event_control "
|
||||
"is not accessable any more");
|
||||
break;
|
||||
}
|
||||
|
||||
printf("%s %s: crossed\n", argv[1], argv[2]);
|
||||
}
|
||||
|
||||
out:
|
||||
if (efd >= 0)
|
||||
close(efd);
|
||||
if (event_control >= 0)
|
||||
close(event_control);
|
||||
if (cfd >= 0)
|
||||
close(cfd);
|
||||
|
||||
return (ret != 0);
|
||||
}
|
@ -22,6 +22,8 @@ CONTENTS:
|
||||
2. Usage Examples and Syntax
|
||||
2.1 Basic Usage
|
||||
2.2 Attaching processes
|
||||
2.3 Mounting hierarchies by name
|
||||
2.4 Notification API
|
||||
3. Kernel API
|
||||
3.1 Overview
|
||||
3.2 Synchronization
|
||||
@ -233,8 +235,7 @@ containing the following files describing that cgroup:
|
||||
- cgroup.procs: list of tgids in the cgroup. This list is not
|
||||
guaranteed to be sorted or free of duplicate tgids, and userspace
|
||||
should sort/uniquify the list if this property is required.
|
||||
Writing a tgid into this file moves all threads with that tgid into
|
||||
this cgroup.
|
||||
This is a read-only file, for now.
|
||||
- notify_on_release flag: run the release agent on exit?
|
||||
- release_agent: the path to use for release notifications (this file
|
||||
exists in the top cgroup only)
|
||||
@ -434,6 +435,25 @@ you give a subsystem a name.
|
||||
The name of the subsystem appears as part of the hierarchy description
|
||||
in /proc/mounts and /proc/<pid>/cgroups.
|
||||
|
||||
2.4 Notification API
|
||||
--------------------
|
||||
|
||||
There is a mechanism which allows one to get notifications about the changing
|
||||
status of a cgroup.
|
||||
|
||||
To register a new notification handler you need to:
|
||||
- create a file descriptor for event notification using eventfd(2);
|
||||
- open a control file to be monitored (e.g. memory.usage_in_bytes);
|
||||
- write "<event_fd> <control_fd> <args>" to cgroup.event_control.
|
||||
Interpretation of args is defined by control file implementation;
|
||||
|
||||
eventfd will be woken up by control file implementation or when the
|
||||
cgroup is removed.
|
||||
|
||||
To unregister a notification handler, just close the eventfd.
|
||||
|
||||
NOTE: Support of notifications should be implemented for the control
|
||||
file. See documentation for the subsystem.
|
||||
|
||||
3. Kernel API
|
||||
=============
|
||||
@ -488,6 +508,11 @@ Each subsystem should:
|
||||
- add an entry in linux/cgroup_subsys.h
|
||||
- define a cgroup_subsys object called <name>_subsys
|
||||
|
||||
If a subsystem can be compiled as a module, it should also have in its
|
||||
module initcall a call to cgroup_load_subsys(), and in its exitcall a
|
||||
call to cgroup_unload_subsys(). It should also set its_subsys.module =
|
||||
THIS_MODULE in its .c file.
|
||||
|
||||
Each subsystem may export the following methods. The only mandatory
|
||||
methods are create/destroy. Any others that are null are presumed to
|
||||
be successful no-ops.
|
||||
@ -536,10 +561,21 @@ returns an error, this will abort the attach operation. If a NULL
|
||||
task is passed, then a successful result indicates that *any*
|
||||
unspecified task can be moved into the cgroup. Note that this isn't
|
||||
called on a fork. If this method returns 0 (success) then this should
|
||||
remain valid while the caller holds cgroup_mutex. If threadgroup is
|
||||
remain valid while the caller holds cgroup_mutex and it is ensured that either
|
||||
attach() or cancel_attach() will be called in future. If threadgroup is
|
||||
true, then a successful result indicates that all threads in the given
|
||||
thread's threadgroup can be moved together.
|
||||
|
||||
void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
|
||||
struct task_struct *task, bool threadgroup)
|
||||
(cgroup_mutex held by caller)
|
||||
|
||||
Called when a task attach operation has failed after can_attach() has succeeded.
|
||||
A subsystem whose can_attach() has some side-effects should provide this
|
||||
function, so that the subsystem can implement a rollback. If not, not necessary.
|
||||
This will be called only for subsystems whose can_attach() operations have
|
||||
succeeded.
|
||||
|
||||
void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
|
||||
struct cgroup *old_cgrp, struct task_struct *task,
|
||||
bool threadgroup)
|
||||
|
@ -168,20 +168,20 @@ Each cpuset is represented by a directory in the cgroup file system
|
||||
containing (on top of the standard cgroup files) the following
|
||||
files describing that cpuset:
|
||||
|
||||
- cpus: list of CPUs in that cpuset
|
||||
- mems: list of Memory Nodes in that cpuset
|
||||
- memory_migrate flag: if set, move pages to cpusets nodes
|
||||
- cpu_exclusive flag: is cpu placement exclusive?
|
||||
- mem_exclusive flag: is memory placement exclusive?
|
||||
- mem_hardwall flag: is memory allocation hardwalled
|
||||
- memory_pressure: measure of how much paging pressure in cpuset
|
||||
- memory_spread_page flag: if set, spread page cache evenly on allowed nodes
|
||||
- memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
|
||||
- sched_load_balance flag: if set, load balance within CPUs on that cpuset
|
||||
- sched_relax_domain_level: the searching range when migrating tasks
|
||||
- cpuset.cpus: list of CPUs in that cpuset
|
||||
- cpuset.mems: list of Memory Nodes in that cpuset
|
||||
- cpuset.memory_migrate flag: if set, move pages to cpusets nodes
|
||||
- cpuset.cpu_exclusive flag: is cpu placement exclusive?
|
||||
- cpuset.mem_exclusive flag: is memory placement exclusive?
|
||||
- cpuset.mem_hardwall flag: is memory allocation hardwalled
|
||||
- cpuset.memory_pressure: measure of how much paging pressure in cpuset
|
||||
- cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
|
||||
- cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
|
||||
- cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
|
||||
- cpuset.sched_relax_domain_level: the searching range when migrating tasks
|
||||
|
||||
In addition, the root cpuset only has the following file:
|
||||
- memory_pressure_enabled flag: compute memory_pressure?
|
||||
- cpuset.memory_pressure_enabled flag: compute memory_pressure?
|
||||
|
||||
New cpusets are created using the mkdir system call or shell
|
||||
command. The properties of a cpuset, such as its flags, allowed
|
||||
@ -229,7 +229,7 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
|
||||
a direct ancestor or descendant, may share any of the same CPUs or
|
||||
Memory Nodes.
|
||||
|
||||
A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
|
||||
A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
|
||||
i.e. it restricts kernel allocations for page, buffer and other data
|
||||
commonly shared by the kernel across multiple users. All cpusets,
|
||||
whether hardwalled or not, restrict allocations of memory for user
|
||||
@ -304,15 +304,15 @@ times 1000.
|
||||
---------------------------
|
||||
There are two boolean flag files per cpuset that control where the
|
||||
kernel allocates pages for the file system buffers and related in
|
||||
kernel data structures. They are called 'memory_spread_page' and
|
||||
'memory_spread_slab'.
|
||||
kernel data structures. They are called 'cpuset.memory_spread_page' and
|
||||
'cpuset.memory_spread_slab'.
|
||||
|
||||
If the per-cpuset boolean flag file 'memory_spread_page' is set, then
|
||||
If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
|
||||
the kernel will spread the file system buffers (page cache) evenly
|
||||
over all the nodes that the faulting task is allowed to use, instead
|
||||
of preferring to put those pages on the node where the task is running.
|
||||
|
||||
If the per-cpuset boolean flag file 'memory_spread_slab' is set,
|
||||
If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
|
||||
then the kernel will spread some file system related slab caches,
|
||||
such as for inodes and dentries evenly over all the nodes that the
|
||||
faulting task is allowed to use, instead of preferring to put those
|
||||
@ -337,21 +337,21 @@ their containing tasks memory spread settings. If memory spreading
|
||||
is turned off, then the currently specified NUMA mempolicy once again
|
||||
applies to memory page allocations.
|
||||
|
||||
Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag
|
||||
Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
|
||||
files. By default they contain "0", meaning that the feature is off
|
||||
for that cpuset. If a "1" is written to that file, then that turns
|
||||
the named feature on.
|
||||
|
||||
The implementation is simple.
|
||||
|
||||
Setting the flag 'memory_spread_page' turns on a per-process flag
|
||||
Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
|
||||
PF_SPREAD_PAGE for each task that is in that cpuset or subsequently
|
||||
joins that cpuset. The page allocation calls for the page cache
|
||||
is modified to perform an inline check for this PF_SPREAD_PAGE task
|
||||
flag, and if set, a call to a new routine cpuset_mem_spread_node()
|
||||
returns the node to prefer for the allocation.
|
||||
|
||||
Similarly, setting 'memory_spread_slab' turns on the flag
|
||||
Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
|
||||
PF_SPREAD_SLAB, and appropriately marked slab caches will allocate
|
||||
pages from the node returned by cpuset_mem_spread_node().
|
||||
|
||||
@ -404,24 +404,24 @@ the following two situations:
|
||||
system overhead on those CPUs, including avoiding task load
|
||||
balancing if that is not needed.
|
||||
|
||||
When the per-cpuset flag "sched_load_balance" is enabled (the default
|
||||
setting), it requests that all the CPUs in that cpusets allowed 'cpus'
|
||||
When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
|
||||
setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
|
||||
be contained in a single sched domain, ensuring that load balancing
|
||||
can move a task (not otherwise pinned, as by sched_setaffinity)
|
||||
from any CPU in that cpuset to any other.
|
||||
|
||||
When the per-cpuset flag "sched_load_balance" is disabled, then the
|
||||
When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
|
||||
scheduler will avoid load balancing across the CPUs in that cpuset,
|
||||
--except-- in so far as is necessary because some overlapping cpuset
|
||||
has "cpuset.sched_load_balance" enabled.
|
||||
|
||||
So, for example, if the top cpuset has the flag "sched_load_balance"
|
||||
So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
|
||||
enabled, then the scheduler will have one sched domain covering all
|
||||
CPUs, and the setting of the "sched_load_balance" flag in any other
|
||||
CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
|
||||
cpusets won't matter, as we're already fully load balancing.
|
||||
|
||||
Therefore in the above two situations, the top cpuset flag
|
||||
"sched_load_balance" should be disabled, and only some of the smaller,
|
||||
"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
|
||||
child cpusets have this flag enabled.
|
||||
|
||||
When doing this, you don't usually want to leave any unpinned tasks in
|
||||
@ -433,7 +433,7 @@ scheduler might not consider the possibility of load balancing that
|
||||
task to that underused CPU.
|
||||
|
||||
Of course, tasks pinned to a particular CPU can be left in a cpuset
|
||||
that disables "sched_load_balance" as those tasks aren't going anywhere
|
||||
that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
|
||||
else anyway.
|
||||
|
||||
There is an impedance mismatch here, between cpusets and sched domains.
|
||||
@ -443,19 +443,19 @@ overlap and each CPU is in at most one sched domain.
|
||||
It is necessary for sched domains to be flat because load balancing
|
||||
across partially overlapping sets of CPUs would risk unstable dynamics
|
||||
that would be beyond our understanding. So if each of two partially
|
||||
overlapping cpusets enables the flag 'sched_load_balance', then we
|
||||
overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
|
||||
form a single sched domain that is a superset of both. We won't move
|
||||
a task to a CPU outside its cpuset, but the scheduler load balancing
|
||||
code might waste some compute cycles considering that possibility.
|
||||
|
||||
This mismatch is why there is not a simple one-to-one relation
|
||||
between which cpusets have the flag "sched_load_balance" enabled,
|
||||
between which cpusets have the flag "cpuset.sched_load_balance" enabled,
|
||||
and the sched domain configuration. If a cpuset enables the flag, it
|
||||
will get balancing across all its CPUs, but if it disables the flag,
|
||||
it will only be assured of no load balancing if no other overlapping
|
||||
cpuset enables the flag.
|
||||
|
||||
If two cpusets have partially overlapping 'cpus' allowed, and only
|
||||
If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
|
||||
one of them has this flag enabled, then the other may find its
|
||||
tasks only partially load balanced, just on the overlapping CPUs.
|
||||
This is just the general case of the top_cpuset example given a few
|
||||
@ -468,23 +468,23 @@ load balancing to the other CPUs.
|
||||
1.7.1 sched_load_balance implementation details.
|
||||
------------------------------------------------
|
||||
|
||||
The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary
|
||||
The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
|
||||
to most cpuset flags.) When enabled for a cpuset, the kernel will
|
||||
ensure that it can load balance across all the CPUs in that cpuset
|
||||
(makes sure that all the CPUs in the cpus_allowed of that cpuset are
|
||||
in the same sched domain.)
|
||||
|
||||
If two overlapping cpusets both have 'sched_load_balance' enabled,
|
||||
If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
|
||||
then they will be (must be) both in the same sched domain.
|
||||
|
||||
If, as is the default, the top cpuset has 'sched_load_balance' enabled,
|
||||
If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
|
||||
then by the above that means there is a single sched domain covering
|
||||
the whole system, regardless of any other cpuset settings.
|
||||
|
||||
The kernel commits to user space that it will avoid load balancing
|
||||
where it can. It will pick as fine a granularity partition of sched
|
||||
domains as it can while still providing load balancing for any set
|
||||
of CPUs allowed to a cpuset having 'sched_load_balance' enabled.
|
||||
of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
|
||||
|
||||
The internal kernel cpuset to scheduler interface passes from the
|
||||
cpuset code to the scheduler code a partition of the load balanced
|
||||
@ -495,9 +495,9 @@ all the CPUs that must be load balanced.
|
||||
The cpuset code builds a new such partition and passes it to the
|
||||
scheduler sched domain setup code, to have the sched domains rebuilt
|
||||
as necessary, whenever:
|
||||
- the 'sched_load_balance' flag of a cpuset with non-empty CPUs changes,
|
||||
- the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
|
||||
- or CPUs come or go from a cpuset with this flag enabled,
|
||||
- or 'sched_relax_domain_level' value of a cpuset with non-empty CPUs
|
||||
- or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
|
||||
and with this flag enabled changes,
|
||||
- or a cpuset with non-empty CPUs and with this flag enabled is removed,
|
||||
- or a cpu is offlined/onlined.
|
||||
@ -542,7 +542,7 @@ As the result, task B on CPU X need to wait task A or wait load balance
|
||||
on the next tick. For some applications in special situation, waiting
|
||||
1 tick may be too long.
|
||||
|
||||
The 'sched_relax_domain_level' file allows you to request changing
|
||||
The 'cpuset.sched_relax_domain_level' file allows you to request changing
|
||||
this searching range as you like. This file takes int value which
|
||||
indicates size of searching range in levels ideally as follows,
|
||||
otherwise initial value -1 that indicates the cpuset has no request.
|
||||
@ -559,8 +559,8 @@ The system default is architecture dependent. The system default
|
||||
can be changed using the relax_domain_level= boot parameter.
|
||||
|
||||
This file is per-cpuset and affects the sched domain where the cpuset
|
||||
belongs to. Therefore if the flag 'sched_load_balance' of a cpuset
|
||||
is disabled, then 'sched_relax_domain_level' have no effect since
|
||||
belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
|
||||
is disabled, then 'cpuset.sched_relax_domain_level' has no effect since
|
||||
there is no sched domain belonging to the cpuset.
|
||||
|
||||
If multiple cpusets are overlapping and hence they form a single sched
|
||||
@ -607,9 +607,9 @@ from one cpuset to another, then the kernel will adjust the tasks
|
||||
memory placement, as above, the next time that the kernel attempts
|
||||
to allocate a page of memory for that task.
|
||||
|
||||
If a cpuset has its 'cpus' modified, then each task in that cpuset
|
||||
If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
|
||||
will have its allowed CPU placement changed immediately. Similarly,
|
||||
if a tasks pid is written to another cpusets 'tasks' file, then its
|
||||
if a task's pid is written to another cpuset's 'tasks' file, then its
|
||||
allowed CPU placement is changed immediately. If such a task had been
|
||||
bound to some subset of its cpuset using the sched_setaffinity() call,
|
||||
the task will be allowed to run on any CPU allowed in its new cpuset,
|
||||
@ -622,8 +622,8 @@ and the processor placement is updated immediately.
|
||||
Normally, once a page is allocated (given a physical page
|
||||
of main memory) then that page stays on whatever node it
|
||||
was allocated, so long as it remains allocated, even if the
|
||||
cpusets memory placement policy 'mems' subsequently changes.
|
||||
If the cpuset flag file 'memory_migrate' is set true, then when
|
||||
cpusets memory placement policy 'cpuset.mems' subsequently changes.
|
||||
If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
|
||||
tasks are attached to that cpuset, any pages that task had
|
||||
allocated to it on nodes in its previous cpuset are migrated
|
||||
to the tasks new cpuset. The relative placement of the page within
|
||||
@ -631,12 +631,12 @@ the cpuset is preserved during these migration operations if possible.
|
||||
For example if the page was on the second valid node of the prior cpuset
|
||||
then the page will be placed on the second valid node of the new cpuset.
|
||||
|
||||
Also if 'memory_migrate' is set true, then if that cpusets
|
||||
'mems' file is modified, pages allocated to tasks in that
|
||||
cpuset, that were on nodes in the previous setting of 'mems',
|
||||
Also if 'cpuset.memory_migrate' is set true, then if that cpusets
|
||||
'cpuset.mems' file is modified, pages allocated to tasks in that
|
||||
cpuset, that were on nodes in the previous setting of 'cpuset.mems',
|
||||
will be moved to nodes in the new setting of 'cpuset.mems'.
|
||||
Pages that were not in the tasks prior cpuset, or in the cpusets
|
||||
prior 'mems' setting, will not be moved.
|
||||
prior 'cpuset.mems' setting, will not be moved.
|
||||
|
||||
There is an exception to the above. If hotplug functionality is used
|
||||
to remove all the CPUs that are currently assigned to a cpuset,
|
||||
@ -678,8 +678,8 @@ and then start a subshell 'sh' in that cpuset:
|
||||
cd /dev/cpuset
|
||||
mkdir Charlie
|
||||
cd Charlie
|
||||
/bin/echo 2-3 > cpus
|
||||
/bin/echo 1 > mems
|
||||
/bin/echo 2-3 > cpuset.cpus
|
||||
/bin/echo 1 > cpuset.mems
|
||||
/bin/echo $$ > tasks
|
||||
sh
|
||||
# The subshell 'sh' is now running in cpuset Charlie
|
||||
@ -725,10 +725,13 @@ Now you want to do something with this cpuset.
|
||||
|
||||
In this directory you can find several files:
|
||||
# ls
|
||||
cpu_exclusive memory_migrate mems tasks
|
||||
cpus memory_pressure notify_on_release
|
||||
mem_exclusive memory_spread_page sched_load_balance
|
||||
mem_hardwall memory_spread_slab sched_relax_domain_level
|
||||
cpuset.cpu_exclusive cpuset.memory_spread_slab
|
||||
cpuset.cpus cpuset.mems
|
||||
cpuset.mem_exclusive cpuset.sched_load_balance
|
||||
cpuset.mem_hardwall cpuset.sched_relax_domain_level
|
||||
cpuset.memory_migrate notify_on_release
|
||||
cpuset.memory_pressure tasks
|
||||
cpuset.memory_spread_page
|
||||
|
||||
Reading them will give you information about the state of this cpuset:
|
||||
the CPUs and Memory Nodes it can use, the processes that are using
|
||||
@ -736,13 +739,13 @@ it, its properties. By writing to these files you can manipulate
|
||||
the cpuset.
|
||||
|
||||
Set some flags:
|
||||
# /bin/echo 1 > cpu_exclusive
|
||||
# /bin/echo 1 > cpuset.cpu_exclusive
|
||||
|
||||
Add some cpus:
|
||||
# /bin/echo 0-7 > cpus
|
||||
# /bin/echo 0-7 > cpuset.cpus
|
||||
|
||||
Add some mems:
|
||||
# /bin/echo 0-7 > mems
|
||||
# /bin/echo 0-7 > cpuset.mems
|
||||
|
||||
Now attach your shell to this cpuset:
|
||||
# /bin/echo $$ > tasks
|
||||
@ -774,28 +777,28 @@ echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
|
||||
This is the syntax to use when writing in the cpus or mems files
|
||||
in cpuset directories:
|
||||
|
||||
# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4
|
||||
# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4
|
||||
# /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4
|
||||
# /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4
|
||||
|
||||
To add a CPU to a cpuset, write the new list of CPUs including the
|
||||
CPU to be added. To add 6 to the above cpuset:
|
||||
|
||||
# /bin/echo 1-4,6 > cpus -> set cpus list to cpus 1,2,3,4,6
|
||||
# /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6
|
||||
|
||||
Similarly to remove a CPU from a cpuset, write the new list of CPUs
|
||||
without the CPU to be removed.
|
||||
|
||||
To remove all the CPUs:
|
||||
|
||||
# /bin/echo "" > cpus -> clear cpus list
|
||||
# /bin/echo "" > cpuset.cpus -> clear cpus list
|
||||
|
||||
2.3 Setting flags
|
||||
-----------------
|
||||
|
||||
The syntax is very simple:
|
||||
|
||||
# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive'
|
||||
# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive'
|
||||
# /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive'
|
||||
# /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive'
|
||||
|
||||
2.4 Attaching processes
|
||||
-----------------------
|
||||
|
@ -1,6 +1,6 @@
|
||||
Memory Resource Controller(Memcg) Implementation Memo.
|
||||
Last Updated: 2009/1/20
|
||||
Base Kernel Version: based on 2.6.29-rc2.
|
||||
Last Updated: 2010/2
|
||||
Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
|
||||
|
||||
Because VM is getting complex (one of reasons is memcg...), memcg's behavior
|
||||
is complex. This is a document for memcg's internal behavior.
|
||||
@ -337,7 +337,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
|
||||
race and lock dependency with other cgroup subsystems.
|
||||
|
||||
example)
|
||||
# mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices
|
||||
# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
|
||||
|
||||
and do task move, mkdir, rmdir etc...under this.
|
||||
|
||||
@ -348,7 +348,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
|
||||
|
||||
For example, test like following is good.
|
||||
(Shell-A)
|
||||
# mount -t cgroup none /cgroup -t memory
|
||||
# mount -t cgroup none /cgroup -o memory
|
||||
# mkdir /cgroup/test
|
||||
# echo 40M > /cgroup/test/memory.limit_in_bytes
|
||||
# echo 0 > /cgroup/test/tasks
|
||||
@ -378,3 +378,42 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
|
||||
#echo 50M > memory.limit_in_bytes
|
||||
#echo 50M > memory.memsw.limit_in_bytes
|
||||
run 51M of malloc
|
||||
|
||||
9.9 Move charges at task migration
|
||||
Charges associated with a task can be moved along with task migration.
|
||||
|
||||
(Shell-A)
|
||||
#mkdir /cgroup/A
|
||||
#echo $$ >/cgroup/A/tasks
|
||||
run some programs which uses some amount of memory in /cgroup/A.
|
||||
|
||||
(Shell-B)
|
||||
#mkdir /cgroup/B
|
||||
#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
|
||||
#echo "pid of the program running in group A" >/cgroup/B/tasks
|
||||
|
||||
You can see charges have been moved by reading *.usage_in_bytes or
|
||||
memory.stat of both A and B.
|
||||
See 8.2 of Documentation/cgroups/memory.txt to see what value should be
|
||||
written to move_charge_at_immigrate.
|
||||
|
||||
9.10 Memory thresholds
|
||||
Memory controller implements memory thresholds using cgroups notification
|
||||
API. You can use Documentation/cgroups/cgroup_event_listener.c to test
|
||||
it.
|
||||
|
||||
(Shell-A) Create cgroup and run event listener
|
||||
# mkdir /cgroup/A
|
||||
# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
|
||||
|
||||
(Shell-B) Add task to cgroup and try to allocate and free memory
|
||||
# echo $$ >/cgroup/A/tasks
|
||||
# a="$(dd if=/dev/zero bs=1M count=10)"
|
||||
# a=
|
||||
|
||||
You will see message from cgroup_event_listener every time you cross
|
||||
the thresholds.
|
||||
|
||||
Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
|
||||
|
||||
It's a good idea to test the root cgroup as well.
|
||||
|
@ -182,6 +182,8 @@ list.
|
||||
NOTE: Reclaim does not work for the root cgroup, since we cannot set any
|
||||
limits on the root cgroup.
|
||||
|
||||
Note2: When panic_on_oom is set to "2", the whole system will panic.
|
||||
|
||||
2. Locking
|
||||
|
||||
The memory controller uses the following hierarchy
|
||||
@ -262,10 +264,12 @@ some of the pages cached in the cgroup (page cache pages).
|
||||
4.2 Task migration
|
||||
|
||||
When a task migrates from one cgroup to another, its charge is not
|
||||
carried forward. The pages allocated from the original cgroup still
|
||||
carried forward by default. The pages allocated from the original cgroup still
|
||||
remain charged to it, the charge is dropped when the page is freed or
|
||||
reclaimed.
|
||||
|
||||
Note: You can move charges of a task along with task migration. See 8.
|
||||
|
||||
4.3 Removing a cgroup
|
||||
|
||||
A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
|
||||
@ -336,7 +340,7 @@ Note:
|
||||
5.3 swappiness
|
||||
Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
|
||||
|
||||
Following cgroups' swapiness can't be changed.
|
||||
Following cgroups' swappiness can't be changed.
|
||||
- root cgroup (uses /proc/sys/vm/swappiness).
|
||||
- a cgroup which uses hierarchy and it has child cgroup.
|
||||
- a cgroup which uses hierarchy and not the root of hierarchy.
|
||||
@ -377,7 +381,8 @@ The feature can be disabled by
|
||||
NOTE1: Enabling/disabling will fail if the cgroup already has other
|
||||
cgroups created below it.
|
||||
|
||||
NOTE2: This feature can be enabled/disabled per subtree.
|
||||
NOTE2: When panic_on_oom is set to "2", the whole system will panic in
|
||||
case of an oom event in any cgroup.
|
||||
|
||||
7. Soft limits
|
||||
|
||||
@ -414,7 +419,76 @@ NOTE1: Soft limits take effect over a long period of time, since they involve
|
||||
NOTE2: It is recommended to set the soft limit always below the hard limit,
|
||||
otherwise the hard limit will take precedence.
|
||||
|
||||
8. TODO
|
||||
8. Move charges at task migration
|
||||
|
||||
Users can move charges associated with a task along with task migration, that
|
||||
is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
|
||||
This feature is not supported in !CONFIG_MMU environments because of lack of
|
||||
page tables.
|
||||
|
||||
8.1 Interface
|
||||
|
||||
This feature is disabled by default. It can be enabled(and disabled again) by
|
||||
writing to memory.move_charge_at_immigrate of the destination cgroup.
|
||||
|
||||
If you want to enable it:
|
||||
|
||||
# echo (some positive value) > memory.move_charge_at_immigrate
|
||||
|
||||
Note: Each bit of move_charge_at_immigrate has its own meaning about what type
|
||||
of charges should be moved. See 8.2 for details.
|
||||
Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread
|
||||
group.
|
||||
Note: If we cannot find enough space for the task in the destination cgroup, we
|
||||
try to make space by reclaiming memory. Task migration may fail if we
|
||||
cannot make enough space.
|
||||
Note: It can take several seconds if you move charges on the order of gigabytes.
|
||||
|
||||
And if you want to disable it again:
|
||||
|
||||
# echo 0 > memory.move_charge_at_immigrate
|
||||
|
||||
8.2 Type of charges which can be moved
|
||||
|
||||
Each bit of move_charge_at_immigrate has its own meaning about what type of
|
||||
charges should be moved.
|
||||
|
||||
bit | what type of charges would be moved ?
|
||||
-----+------------------------------------------------------------------------
|
||||
0 | A charge of an anonymous page(or swap of it) used by the target task.
|
||||
| Those pages and swaps must be used only by the target task. You must
|
||||
| enable Swap Extension(see 2.4) to enable move of swap charges.
|
||||
|
||||
Note: Those pages and swaps must be charged to the old cgroup.
|
||||
Note: More types of pages (e.g. file cache, shmem) will be supported by other
|
||||
bits in future.
|
||||
|
||||
8.3 TODO
|
||||
|
||||
- Add support for other types of pages(e.g. file cache, shmem, etc.).
|
||||
- Implement madvise(2) to let users decide the vma to be moved or not to be
|
||||
moved.
|
||||
- All of moving charge operations are done under cgroup_mutex. It's not good
|
||||
behavior to hold the mutex too long, so we may need some trick.
|
||||
|
||||
9. Memory thresholds
|
||||
|
||||
Memory controller implements memory thresholds using cgroups notification
|
||||
API (see cgroups.txt). It allows to register multiple memory and memsw
|
||||
thresholds and get notified when memory usage crosses them.
|
||||
|
||||
To register a threshold, an application needs to:
|
||||
- create an eventfd using eventfd(2);
|
||||
- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
|
||||
- write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to
|
||||
cgroup.event_control.
|
||||
|
||||
Application will be notified through eventfd when memory usage crosses
|
||||
threshold in any direction.
|
||||
|
||||
It's applicable for root and non-root cgroup.
|
||||
|
||||
10. TODO
|
||||
|
||||
1. Add support for accounting huge pages (as a separate controller)
|
||||
2. Make per-cgroup scanner reclaim not-shared pages first
|
||||
|
234
Documentation/circular-buffers.txt
Normal file
234
Documentation/circular-buffers.txt
Normal file
@ -0,0 +1,234 @@
|
||||
================
|
||||
CIRCULAR BUFFERS
|
||||
================
|
||||
|
||||
By: David Howells <dhowells@redhat.com>
|
||||
Paul E. McKenney <paulmck@linux.vnet.ibm.com>
|
||||
|
||||
|
||||
Linux provides a number of features that can be used to implement circular
|
||||
buffering. There are two sets of such features:
|
||||
|
||||
(1) Convenience functions for determining information about power-of-2 sized
|
||||
buffers.
|
||||
|
||||
(2) Memory barriers for when the producer and the consumer of objects in the
|
||||
buffer don't want to share a lock.
|
||||
|
||||
To use these facilities, as discussed below, there needs to be just one
|
||||
producer and just one consumer. It is possible to handle multiple producers by
|
||||
serialising them, and to handle multiple consumers by serialising them.
|
||||
|
||||
|
||||
Contents:
|
||||
|
||||
(*) What is a circular buffer?
|
||||
|
||||
(*) Measuring power-of-2 buffers.
|
||||
|
||||
(*) Using memory barriers with circular buffers.
|
||||
- The producer.
|
||||
- The consumer.
|
||||
|
||||
|
||||
==========================
|
||||
WHAT IS A CIRCULAR BUFFER?
|
||||
==========================
|
||||
|
||||
First of all, what is a circular buffer? A circular buffer is a buffer of
|
||||
fixed, finite size into which there are two indices:
|
||||
|
||||
(1) A 'head' index - the point at which the producer inserts items into the
|
||||
buffer.
|
||||
|
||||
(2) A 'tail' index - the point at which the consumer finds the next item in
|
||||
the buffer.
|
||||
|
||||
Typically when the tail pointer is equal to the head pointer, the buffer is
|
||||
empty; and the buffer is full when the head pointer is one less than the tail
|
||||
pointer.
|
||||
|
||||
The head index is incremented when items are added, and the tail index when
|
||||
items are removed. The tail index should never jump the head index, and both
|
||||
indices should be wrapped to 0 when they reach the end of the buffer, thus
|
||||
allowing an infinite amount of data to flow through the buffer.
|
||||
|
||||
Typically, items will all be of the same unit size, but this isn't strictly
|
||||
required to use the techniques below. The indices can be increased by more
|
||||
than 1 if multiple items or variable-sized items are to be included in the
|
||||
buffer, provided that neither index overtakes the other. The implementer must
|
||||
be careful, however, as a region more than one unit in size may wrap the end of
|
||||
the buffer and be broken into two segments.
|
||||
|
||||
|
||||
============================
|
||||
MEASURING POWER-OF-2 BUFFERS
|
||||
============================
|
||||
|
||||
Calculation of the occupancy or the remaining capacity of an arbitrarily sized
|
||||
circular buffer would normally be a slow operation, requiring the use of a
|
||||
modulus (divide) instruction. However, if the buffer is of a power-of-2 size,
|
||||
then a much quicker bitwise-AND instruction can be used instead.
|
||||
|
||||
Linux provides a set of macros for handling power-of-2 circular buffers. These
|
||||
can be made use of by:
|
||||
|
||||
#include <linux/circ_buf.h>
|
||||
|
||||
The macros are:
|
||||
|
||||
(*) Measure the remaining capacity of a buffer:
|
||||
|
||||
CIRC_SPACE(head_index, tail_index, buffer_size);
|
||||
|
||||
This returns the amount of space left in the buffer[1] into which items
|
||||
can be inserted.
|
||||
|
||||
|
||||
(*) Measure the maximum consecutive immediate space in a buffer:
|
||||
|
||||
CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
|
||||
|
||||
This returns the amount of consecutive space left in the buffer[1] into
|
||||
which items can be immediately inserted without having to wrap back to the
|
||||
beginning of the buffer.
|
||||
|
||||
|
||||
(*) Measure the occupancy of a buffer:
|
||||
|
||||
CIRC_CNT(head_index, tail_index, buffer_size);
|
||||
|
||||
This returns the number of items currently occupying a buffer[2].
|
||||
|
||||
|
||||
(*) Measure the non-wrapping occupancy of a buffer:
|
||||
|
||||
CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
|
||||
|
||||
This returns the number of consecutive items[2] that can be extracted from
|
||||
the buffer without having to wrap back to the beginning of the buffer.
|
||||
|
||||
|
||||
Each of these macros will nominally return a value between 0 and buffer_size-1,
|
||||
however:
|
||||
|
||||
[1] CIRC_SPACE*() are intended to be used in the producer. To the producer
|
||||
they will return a lower bound as the producer controls the head index,
|
||||
but the consumer may still be depleting the buffer on another CPU and
|
||||
moving the tail index.
|
||||
|
||||
To the consumer it will show an upper bound as the producer may be busy
|
||||
depleting the space.
|
||||
|
||||
[2] CIRC_CNT*() are intended to be used in the consumer. To the consumer they
|
||||
will return a lower bound as the consumer controls the tail index, but the
|
||||
producer may still be filling the buffer on another CPU and moving the
|
||||
head index.
|
||||
|
||||
To the producer it will show an upper bound as the consumer may be busy
|
||||
emptying the buffer.
|
||||
|
||||
[3] To a third party, the order in which the writes to the indices by the
|
||||
producer and consumer become visible cannot be guaranteed as they are
|
||||
independent and may be made on different CPUs - so the result in such a
|
||||
situation will merely be a guess, and may even be negative.
|
||||
|
||||
|
||||
===========================================
|
||||
USING MEMORY BARRIERS WITH CIRCULAR BUFFERS
|
||||
===========================================
|
||||
|
||||
By using memory barriers in conjunction with circular buffers, you can avoid
|
||||
the need to:
|
||||
|
||||
(1) use a single lock to govern access to both ends of the buffer, thus
|
||||
allowing the buffer to be filled and emptied at the same time; and
|
||||
|
||||
(2) use atomic counter operations.
|
||||
|
||||
There are two sides to this: the producer that fills the buffer, and the
|
||||
consumer that empties it. Only one thing should be filling a buffer at any one
|
||||
time, and only one thing should be emptying a buffer at any one time, but the
|
||||
two sides can operate simultaneously.
|
||||
|
||||
|
||||
THE PRODUCER
|
||||
------------
|
||||
|
||||
The producer will look something like this:
|
||||
|
||||
spin_lock(&producer_lock);
|
||||
|
||||
unsigned long head = buffer->head;
|
||||
unsigned long tail = ACCESS_ONCE(buffer->tail);
|
||||
|
||||
if (CIRC_SPACE(head, tail, buffer->size) >= 1) {
|
||||
/* insert one item into the buffer */
|
||||
struct item *item = buffer[head];
|
||||
|
||||
produce_item(item);
|
||||
|
||||
smp_wmb(); /* commit the item before incrementing the head */
|
||||
|
||||
buffer->head = (head + 1) & (buffer->size - 1);
|
||||
|
||||
/* wake_up() will make sure that the head is committed before
|
||||
* waking anyone up */
|
||||
wake_up(consumer);
|
||||
}
|
||||
|
||||
spin_unlock(&producer_lock);
|
||||
|
||||
This will instruct the CPU that the contents of the new item must be written
|
||||
before the head index makes it available to the consumer and then instructs the
|
||||
CPU that the revised head index must be written before the consumer is woken.
|
||||
|
||||
Note that wake_up() doesn't have to be the exact mechanism used, but whatever
|
||||
is used must guarantee a (write) memory barrier between the update of the head
|
||||
index and the change of state of the consumer, if a change of state occurs.
|
||||
|
||||
|
||||
THE CONSUMER
|
||||
------------
|
||||
|
||||
The consumer will look something like this:
|
||||
|
||||
spin_lock(&consumer_lock);
|
||||
|
||||
unsigned long head = ACCESS_ONCE(buffer->head);
|
||||
unsigned long tail = buffer->tail;
|
||||
|
||||
if (CIRC_CNT(head, tail, buffer->size) >= 1) {
|
||||
/* read index before reading contents at that index */
|
||||
smp_read_barrier_depends();
|
||||
|
||||
/* extract one item from the buffer */
|
||||
struct item *item = buffer[tail];
|
||||
|
||||
consume_item(item);
|
||||
|
||||
smp_mb(); /* finish reading descriptor before incrementing tail */
|
||||
|
||||
buffer->tail = (tail + 1) & (buffer->size - 1);
|
||||
}
|
||||
|
||||
spin_unlock(&consumer_lock);
|
||||
|
||||
This will instruct the CPU to make sure the index is up to date before reading
|
||||
the new item, and then it shall make sure the CPU has finished reading the item
|
||||
before it writes the new tail pointer, which will erase the item.
|
||||
|
||||
|
||||
Note the use of ACCESS_ONCE() in both algorithms to read the opposition index.
|
||||
This prevents the compiler from discarding and reloading its cached value -
|
||||
which some compilers will do across smp_read_barrier_depends(). This isn't
|
||||
strictly needed if you can be sure that the opposition index will _only_ be
|
||||
used the once.
|
||||
|
||||
|
||||
===============
|
||||
FURTHER READING
|
||||
===============
|
||||
|
||||
See also Documentation/memory-barriers.txt for a description of Linux's memory
|
||||
barrier facilities.
|
@ -25,6 +25,7 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/timer.h>
|
||||
|
||||
#include <linux/connector.h>
|
||||
|
@ -74,7 +74,7 @@ driver takes over the consoles vacated by the driver. Binding, on the other
|
||||
hand, will bind the driver to the consoles that are currently occupied by a
|
||||
system driver.
|
||||
|
||||
NOTE1: Binding and binding must be selected in Kconfig. It's under:
|
||||
NOTE1: Binding and unbinding must be selected in Kconfig. It's under:
|
||||
|
||||
Device Drivers -> Character devices -> Support for binding and unbinding
|
||||
console drivers
|
||||
|
@ -145,8 +145,8 @@ show_sampling_rate_max: THIS INTERFACE IS DEPRECATED, DON'T USE IT.
|
||||
up_threshold: defines what the average CPU usage between the samplings
|
||||
of 'sampling_rate' needs to be for the kernel to make a decision on
|
||||
whether it should increase the frequency. For example when it is set
|
||||
to its default value of '80' it means that between the checking
|
||||
intervals the CPU needs to be on average more than 80% in use to then
|
||||
to its default value of '95' it means that between the checking
|
||||
intervals the CPU needs to be on average more than 95% in use to then
|
||||
decide that the CPU frequency needs to be increased.
|
||||
|
||||
ignore_nice_load: this parameter takes a value of '0' or '1'. When
|
||||
|
207
Documentation/cpu-freq/pcc-cpufreq.txt
Normal file
207
Documentation/cpu-freq/pcc-cpufreq.txt
Normal file
@ -0,0 +1,207 @@
|
||||
/*
|
||||
* pcc-cpufreq.txt - PCC interface documentation
|
||||
*
|
||||
* Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
|
||||
* Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
|
||||
* Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
|
||||
*
|
||||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; version 2 of the License.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
|
||||
* INFRINGEMENT. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
*/
|
||||
|
||||
|
||||
Processor Clocking Control Driver
|
||||
---------------------------------
|
||||
|
||||
Contents:
|
||||
---------
|
||||
1. Introduction
|
||||
1.1 PCC interface
|
||||
1.1.1 Get Average Frequency
|
||||
1.1.2 Set Desired Frequency
|
||||
1.2 Platforms affected
|
||||
2. Driver and /sys details
|
||||
2.1 scaling_available_frequencies
|
||||
2.2 cpuinfo_transition_latency
|
||||
2.3 cpuinfo_cur_freq
|
||||
2.4 related_cpus
|
||||
3. Caveats
|
||||
|
||||
1. Introduction:
|
||||
----------------
|
||||
Processor Clocking Control (PCC) is an interface between the platform
|
||||
firmware and OSPM. It is a mechanism for coordinating processor
|
||||
performance (ie: frequency) between the platform firmware and the OS.
|
||||
|
||||
The PCC driver (pcc-cpufreq) allows OSPM to take advantage of the PCC
|
||||
interface.
|
||||
|
||||
OS utilizes the PCC interface to inform platform firmware what frequency the
|
||||
OS wants for a logical processor. The platform firmware attempts to achieve
|
||||
the requested frequency. If the request for the target frequency could not be
|
||||
satisfied by platform firmware, then it usually means that power budget
|
||||
conditions are in place, and "power capping" is taking place.
|
||||
|
||||
1.1 PCC interface:
|
||||
------------------
|
||||
The complete PCC specification is available here:
|
||||
http://www.acpica.org/download/Processor-Clocking-Control-v1p0.pdf
|
||||
|
||||
PCC relies on a shared memory region that provides a channel for communication
|
||||
between the OS and platform firmware. PCC also implements a "doorbell" that
|
||||
is used by the OS to inform the platform firmware that a command has been
|
||||
sent.
|
||||
|
||||
The ACPI PCCH() method is used to discover the location of the PCC shared
|
||||
memory region. The shared memory region header contains the "command" and
|
||||
"status" interface. PCCH() also contains details on how to access the platform
|
||||
doorbell.
|
||||
|
||||
The following commands are supported by the PCC interface:
|
||||
* Get Average Frequency
|
||||
* Set Desired Frequency
|
||||
|
||||
The ACPI PCCP() method is implemented for each logical processor and is
|
||||
used to discover the offsets for the input and output buffers in the shared
|
||||
memory region.
|
||||
|
||||
When PCC mode is enabled, the platform will not expose processor performance
|
||||
or throttle states (_PSS, _TSS and related ACPI objects) to OSPM. Therefore,
|
||||
the native P-state driver (such as acpi-cpufreq for Intel, powernow-k8 for
|
||||
AMD) will not load.
|
||||
|
||||
However, OSPM remains in control of policy. The governor (eg: "ondemand")
|
||||
computes the required performance for each processor based on server workload.
|
||||
The PCC driver fills in the command interface, and the input buffer and
|
||||
communicates the request to the platform firmware. The platform firmware is
|
||||
responsible for delivering the requested performance.
|
||||
|
||||
Each PCC command is "global" in scope and can affect all the logical CPUs in
|
||||
the system. Therefore, PCC is capable of performing "group" updates. With PCC
|
||||
the OS is capable of getting/setting the frequency of all the logical CPUs in
|
||||
the system with a single call to the BIOS.
|
||||
|
||||
1.1.1 Get Average Frequency:
|
||||
----------------------------
|
||||
This command is used by the OSPM to query the running frequency of the
|
||||
processor since the last time this command was completed. The output buffer
|
||||
indicates the average unhalted frequency of the logical processor expressed as
|
||||
a percentage of the nominal (ie: maximum) CPU frequency. The output buffer
|
||||
also signifies if the CPU frequency is limited by a power budget condition.
|
||||
|
||||
1.1.2 Set Desired Frequency:
|
||||
----------------------------
|
||||
This command is used by the OSPM to communicate to the platform firmware the
|
||||
desired frequency for a logical processor. The output buffer is currently
|
||||
ignored by OSPM. The next invocation of "Get Average Frequency" will inform
|
||||
OSPM if the desired frequency was achieved or not.
|
||||
|
||||
1.2 Platforms affected:
|
||||
-----------------------
|
||||
The PCC driver will load on any system where the platform firmware:
|
||||
* supports the PCC interface, and the associated PCCH() and PCCP() methods
|
||||
* assumes responsibility for managing the hardware clocking controls in order
|
||||
to deliver the requested processor performance
|
||||
|
||||
Currently, certain HP ProLiant platforms implement the PCC interface. On those
|
||||
platforms PCC is the "default" choice.
|
||||
|
||||
However, it is possible to disable this interface via a BIOS setting. In
|
||||
such an instance, as is also the case on platforms where the PCC interface
|
||||
is not implemented, the PCC driver will fail to load silently.
|
||||
|
||||
2. Driver and /sys details:
|
||||
---------------------------
|
||||
When the driver loads, it merely prints the lowest and the highest CPU
|
||||
frequencies supported by the platform firmware.
|
||||
|
||||
The PCC driver loads with a message such as:
|
||||
pcc-cpufreq: (v1.00.00) driver loaded with frequency limits: 1600 MHz, 2933
|
||||
MHz
|
||||
|
||||
This means that the OSPM can request the CPU to run at any frequency in
|
||||
between the limits (1600 MHz, and 2933 MHz) specified in the message.
|
||||
|
||||
Internally, there is no need for the driver to convert the "target" frequency
|
||||
to a corresponding P-state.
|
||||
|
||||
The VERSION number for the driver will be of the format v.xy.ab.
|
||||
eg: 1.00.02
|
||||
----- --
|
||||
| |
|
||||
| -- this will increase with bug fixes/enhancements to the driver
|
||||
|-- this is the version of the PCC specification the driver adheres to
|
||||
|
||||
|
||||
The following is a brief discussion on some of the fields exported via the
|
||||
/sys filesystem and how their values are affected by the PCC driver:
|
||||
|
||||
2.1 scaling_available_frequencies:
|
||||
----------------------------------
|
||||
scaling_available_frequencies is not created in /sys. No intermediate
|
||||
frequencies need to be listed because the BIOS will try to achieve any
|
||||
frequency, within limits, requested by the governor. A frequency does not have
|
||||
to be strictly associated with a P-state.
|
||||
|
||||
2.2 cpuinfo_transition_latency:
|
||||
-------------------------------
|
||||
The cpuinfo_transition_latency field is 0. The PCC specification does
|
||||
not include a field to expose this value currently.
|
||||
|
||||
2.3 cpuinfo_cur_freq:
|
||||
---------------------
|
||||
A) Often cpuinfo_cur_freq will show a value different than what is declared
|
||||
in the scaling_available_frequencies or scaling_cur_freq, or scaling_max_freq.
|
||||
This is due to "turbo boost" available on recent Intel processors. If certain
|
||||
conditions are met the BIOS can achieve a slightly higher speed than requested
|
||||
by OSPM. An example:
|
||||
|
||||
scaling_cur_freq : 2933000
|
||||
cpuinfo_cur_freq : 3196000
|
||||
|
||||
B) There is a round-off error associated with the cpuinfo_cur_freq value.
|
||||
Since the driver obtains the current frequency as a "percentage" (%) of the
|
||||
nominal frequency from the BIOS, sometimes, the values displayed by
|
||||
scaling_cur_freq and cpuinfo_cur_freq may not match. An example:
|
||||
|
||||
scaling_cur_freq : 1600000
|
||||
cpuinfo_cur_freq : 1583000
|
||||
|
||||
In this example, the nominal frequency is 2933 MHz. The driver obtains the
|
||||
current frequency, cpuinfo_cur_freq, as 54% of the nominal frequency:
|
||||
|
||||
54% of 2933 MHz = 1583 MHz
|
||||
|
||||
Nominal frequency is the maximum frequency of the processor, and it usually
|
||||
corresponds to the frequency of the P0 P-state.
|
||||
|
||||
2.4 related_cpus:
|
||||
-----------------
|
||||
The related_cpus field is identical to affected_cpus.
|
||||
|
||||
affected_cpus : 4
|
||||
related_cpus : 4
|
||||
|
||||
Currently, the PCC driver does not evaluate _PSD. The platforms that support
|
||||
PCC do not implement SW_ALL. So OSPM doesn't need to perform any coordination
|
||||
to ensure that the same frequency is requested of all dependent CPUs.
|
||||
|
||||
3. Caveats:
|
||||
-----------
|
||||
The "cpufreq_stats" module in its present form cannot be loaded and
|
||||
expected to work with the PCC driver. Since the "cpufreq_stats" module
|
||||
provides information wrt each P-state, it is not applicable to the PCC driver.
|
@ -122,3 +122,47 @@ volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
|
||||
brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
|
||||
brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
|
||||
brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
|
||||
|
||||
|
||||
How to determine when a merging is complete
|
||||
===========================================
|
||||
The snapshot-merge and snapshot status lines end with:
|
||||
<sectors_allocated>/<total_sectors> <metadata_sectors>
|
||||
|
||||
Both <sectors_allocated> and <total_sectors> include both data and metadata.
|
||||
During merging, the number of sectors allocated gets smaller and
|
||||
smaller. Merging has finished when the number of sectors holding data
|
||||
is zero, in other words <sectors_allocated> == <metadata_sectors>.
|
||||
|
||||
Here is a practical example (using a hybrid of lvm and dmsetup commands):
|
||||
|
||||
# lvs
|
||||
LV VG Attr LSize Origin Snap% Move Log Copy% Convert
|
||||
base volumeGroup owi-a- 4.00g
|
||||
snap volumeGroup swi-a- 1.00g base 18.97
|
||||
|
||||
# dmsetup status volumeGroup-snap
|
||||
0 8388608 snapshot 397896/2097152 1560
|
||||
^^^^ metadata sectors
|
||||
|
||||
# lvconvert --merge -b volumeGroup/snap
|
||||
Merging of volume snap started.
|
||||
|
||||
# lvs volumeGroup/snap
|
||||
LV VG Attr LSize Origin Snap% Move Log Copy% Convert
|
||||
base volumeGroup Owi-a- 4.00g 17.23
|
||||
|
||||
# dmsetup status volumeGroup-base
|
||||
0 8388608 snapshot-merge 281688/2097152 1104
|
||||
|
||||
# dmsetup status volumeGroup-base
|
||||
0 8388608 snapshot-merge 180480/2097152 712
|
||||
|
||||
# dmsetup status volumeGroup-base
|
||||
0 8388608 snapshot-merge 16/2097152 16
|
||||
|
||||
Merging has finished.
|
||||
|
||||
# lvs
|
||||
LV VG Attr LSize Origin Snap% Move Log Copy% Convert
|
||||
base volumeGroup owi-a- 4.00g
|
||||
|
@ -69,7 +69,6 @@ av_permissions.h
|
||||
bbootsect
|
||||
bin2c
|
||||
binkernel.spec
|
||||
binoffset
|
||||
bootsect
|
||||
bounds.h
|
||||
bsetup
|
||||
|
@ -192,7 +192,7 @@ command line. This will execute all matching early_param() callbacks.
|
||||
User specified early platform devices will be registered at this point.
|
||||
For the early serial console case the user can specify port on the
|
||||
kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
|
||||
the class string, "serial" is the name of the platfrom driver and
|
||||
the class string, "serial" is the name of the platform driver and
|
||||
0 is the platform device id. If the id is -1 then the dot and the
|
||||
id can be omitted.
|
||||
|
||||
|
@ -26,7 +26,7 @@ use IO::Handle;
|
||||
"dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004",
|
||||
"or51211", "or51132_qam", "or51132_vsb", "bluebird",
|
||||
"opera1", "cx231xx", "cx18", "cx23885", "pvrusb2", "mpc718",
|
||||
"af9015");
|
||||
"af9015", "ngene");
|
||||
|
||||
# Check args
|
||||
syntax() if (scalar(@ARGV) != 1);
|
||||
@ -39,7 +39,7 @@ for ($i=0; $i < scalar(@components); $i++) {
|
||||
die $@ if $@;
|
||||
print STDERR <<EOF;
|
||||
Firmware(s) $outfile extracted successfully.
|
||||
Now copy it(they) to either /usr/lib/hotplug/firmware or /lib/firmware
|
||||
Now copy it(them) to either /usr/lib/hotplug/firmware or /lib/firmware
|
||||
(depending on configuration of firmware hotplug).
|
||||
EOF
|
||||
exit(0);
|
||||
@ -549,6 +549,24 @@ sub af9015 {
|
||||
close INFILE;
|
||||
}
|
||||
|
||||
sub ngene {
|
||||
my $url = "http://www.digitaldevices.de/download/";
|
||||
my $file1 = "ngene_15.fw";
|
||||
my $hash1 = "d798d5a757121174f0dbc5f2833c0c85";
|
||||
my $file2 = "ngene_17.fw";
|
||||
my $hash2 = "26b687136e127b8ac24b81e0eeafc20b";
|
||||
|
||||
checkstandard();
|
||||
|
||||
wgetfile($file1, $url . $file1);
|
||||
verify($file1, $hash1);
|
||||
|
||||
wgetfile($file2, $url . $file2);
|
||||
verify($file2, $hash2);
|
||||
|
||||
"$file1, $file2";
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Utilities
|
||||
|
||||
@ -667,6 +685,7 @@ sub delzero{
|
||||
sub syntax() {
|
||||
print STDERR "syntax: get_dvb_firmware <component>\n";
|
||||
print STDERR "Supported components:\n";
|
||||
@components = sort @components;
|
||||
for($i=0; $i < scalar(@components); $i++) {
|
||||
print STDERR "\t" . $components[$i] . "\n";
|
||||
}
|
||||
|
@ -171,7 +171,7 @@ device.
|
||||
virtual_root.force_probe :
|
||||
|
||||
Force the probing code to probe EISA slots even when it cannot find an
|
||||
EISA compliant mainboard (nothing appears on slot 0). Defaultd to 0
|
||||
EISA compliant mainboard (nothing appears on slot 0). Defaults to 0
|
||||
(don't force), and set to 1 (force probing) when either
|
||||
CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
|
||||
|
||||
|
@ -216,26 +216,14 @@ Works. Use "Insert file..." or external editor.
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Gmail (Web GUI)
|
||||
|
||||
If you just have to use Gmail to send patches, it CAN be made to work. It
|
||||
requires a bit of external help, though.
|
||||
Does not work for sending patches.
|
||||
|
||||
The first problem is that Gmail converts tabs to spaces. This will
|
||||
totally break your patches. To prevent this, you have to use a different
|
||||
editor. There is a firefox extension called "ViewSourceWith"
|
||||
(https://addons.mozilla.org/en-US/firefox/addon/394) which allows you to
|
||||
edit any text box in the editor of your choice. Configure it to launch
|
||||
your favorite editor. When you want to send a patch, use this technique.
|
||||
Once you have crafted your messsage + patch, save and exit the editor,
|
||||
which should reload the Gmail edit box. GMAIL WILL PRESERVE THE TABS.
|
||||
Hoorah. Apparently you can cut-n-paste literal tabs, but Gmail will
|
||||
convert those to spaces upon sending!
|
||||
Gmail web client converts tabs to spaces automatically.
|
||||
|
||||
The second problem is that Gmail converts tabs to spaces on replies. If
|
||||
you reply to a patch, don't expect to be able to apply it as a patch.
|
||||
At the same time it wraps lines every 78 chars with CRLF style line breaks
|
||||
although tab2space problem can be solved with external editor.
|
||||
|
||||
The last problem is that Gmail will base64-encode any message that has a
|
||||
non-ASCII character. That includes things like European names. Be aware.
|
||||
|
||||
Gmail is not convenient for lkml patches, but CAN be made to work.
|
||||
Another problem is that Gmail will base64-encode any message that has a
|
||||
non-ASCII character. That includes things like European names.
|
||||
|
||||
###
|
||||
|
@ -143,8 +143,8 @@ o provide a way to configure fault attributes
|
||||
failslab, fail_page_alloc, and fail_make_request use this way.
|
||||
Helper functions:
|
||||
|
||||
init_fault_attr_entries(entries, attr, name);
|
||||
void cleanup_fault_attr_entries(entries);
|
||||
init_fault_attr_dentries(entries, attr, name);
|
||||
void cleanup_fault_attr_dentries(entries);
|
||||
|
||||
- module parameters
|
||||
|
||||
|
38
Documentation/fault-injection/provoke-crashes.txt
Normal file
38
Documentation/fault-injection/provoke-crashes.txt
Normal file
@ -0,0 +1,38 @@
|
||||
The lkdtm module provides an interface to crash or injure the kernel at
|
||||
predefined crashpoints to evaluate the reliability of crash dumps obtained
|
||||
using different dumping solutions. The module uses KPROBEs to instrument
|
||||
crashing points, but can also crash the kernel directly without KPROBE
|
||||
support.
|
||||
|
||||
|
||||
You can provide the way either through module arguments when inserting
|
||||
the module, or through a debugfs interface.
|
||||
|
||||
Usage: insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<>
|
||||
[cpoint_count={>0}]
|
||||
|
||||
recur_count : Recursion level for the stack overflow test. Default is 10.
|
||||
|
||||
cpoint_name : Crash point where the kernel is to be crashed. It can be
|
||||
one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY,
|
||||
FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD,
|
||||
IDE_CORE_CP, DIRECT
|
||||
|
||||
cpoint_type : Indicates the action to be taken on hitting the crash point.
|
||||
It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW,
|
||||
CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION,
|
||||
WRITE_AFTER_FREE
|
||||
|
||||
cpoint_count : Indicates the number of times the crash point is to be hit
|
||||
to trigger an action. The default is 10.
|
||||
|
||||
You can also induce failures by mounting debugfs and writing the type to
|
||||
<mountpoint>/provoke-crash/<crashpoint>. E.g.,
|
||||
|
||||
mount -t debugfs debugfs /mnt
|
||||
echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY
|
||||
|
||||
|
||||
A special file is `DIRECT' which will induce the crash directly without
|
||||
KPROBE instrumentation. This mode is the only one available when the module
|
||||
is built on a kernel without KPROBEs support.
|
@ -1,9 +1,9 @@
|
||||
|
||||
What is imacfb?
|
||||
What is efifb?
|
||||
===============
|
||||
|
||||
This is a generic EFI platform driver for Intel based Apple computers.
|
||||
Imacfb is only for EFI booted Intel Macs.
|
||||
efifb is only for EFI booted Intel Macs.
|
||||
|
||||
Supported Hardware
|
||||
==================
|
||||
@ -16,16 +16,16 @@ MacMini
|
||||
How to use it?
|
||||
==============
|
||||
|
||||
Imacfb does not have any kind of autodetection of your machine.
|
||||
efifb does not have any kind of autodetection of your machine.
|
||||
You have to add the following kernel parameters in your elilo.conf:
|
||||
Macbook :
|
||||
video=imacfb:macbook
|
||||
video=efifb:macbook
|
||||
MacMini :
|
||||
video=imacfb:mini
|
||||
video=efifb:mini
|
||||
Macbook Pro 15", iMac 17" :
|
||||
video=imacfb:i17
|
||||
video=efifb:i17
|
||||
Macbook Pro 17", iMac 20" :
|
||||
video=imacfb:i20
|
||||
video=efifb:i20
|
||||
|
||||
--
|
||||
Edgar Hucek <gimli@dark-green.com>
|
@ -6,21 +6,6 @@ be removed from this file.
|
||||
|
||||
---------------------------
|
||||
|
||||
What: USER_SCHED
|
||||
When: 2.6.34
|
||||
|
||||
Why: USER_SCHED was implemented as a proof of concept for group scheduling.
|
||||
The effect of USER_SCHED can already be achieved from userspace with
|
||||
the help of libcgroup. The removal of USER_SCHED will also simplify
|
||||
the scheduler code with the removal of one major ifdef. There are also
|
||||
issues USER_SCHED has with USER_NS. A decision was taken not to fix
|
||||
those and instead remove USER_SCHED. Also new group scheduling
|
||||
features will not be implemented for USER_SCHED.
|
||||
|
||||
Who: Dhaval Giani <dhaval@linux.vnet.ibm.com>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: PRISM54
|
||||
When: 2.6.34
|
||||
|
||||
@ -64,6 +49,17 @@ Who: Robin Getz <rgetz@blackfin.uclinux.org> & Matt Mackall <mpm@selenic.com>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: Deprecated snapshot ioctls
|
||||
When: 2.6.36
|
||||
|
||||
Why: The ioctls in kernel/power/user.c were marked as deprecated a long time
|
||||
ago. Now they notify users about that so that they need to replace
|
||||
their userspace. After some more time, remove them completely.
|
||||
|
||||
Who: Jiri Slaby <jirislaby@gmail.com>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: The ieee80211_regdom module parameter
|
||||
When: March 2010 / desktop catchup
|
||||
|
||||
@ -88,27 +84,6 @@ Who: Luis R. Rodriguez <lrodriguez@atheros.com>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: CONFIG_WIRELESS_OLD_REGULATORY - old static regulatory information
|
||||
When: March 2010 / desktop catchup
|
||||
|
||||
Why: The old regulatory infrastructure has been replaced with a new one
|
||||
which does not require statically defined regulatory domains. We do
|
||||
not want to keep static regulatory domains in the kernel due to the
|
||||
the dynamic nature of regulatory law and localization. We kept around
|
||||
the old static definitions for the regulatory domains of:
|
||||
|
||||
* US
|
||||
* JP
|
||||
* EU
|
||||
|
||||
and used by default the US when CONFIG_WIRELESS_OLD_REGULATORY was
|
||||
set. We will remove this option once the standard Linux desktop catches
|
||||
up with the new userspace APIs we have implemented.
|
||||
|
||||
Who: Luis R. Rodriguez <lrodriguez@atheros.com>
|
||||
|
||||
---------------------------
|
||||
|
||||
What: dev->power.power_state
|
||||
When: July 2007
|
||||
Why: Broken design for runtime control over driver power states, confusing
|
||||
@ -142,19 +117,25 @@ Who: Mauro Carvalho Chehab <mchehab@infradead.org>
|
||||
---------------------------
|
||||
|
||||
What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl])
|
||||
When: November 2005
|
||||
When: 2.6.35/2.6.36
|
||||
Files: drivers/pcmcia/: pcmcia_ioctl.c
|
||||
Why: With the 16-bit PCMCIA subsystem now behaving (almost) like a
|
||||
normal hotpluggable bus, and with it using the default kernel
|
||||
infrastructure (hotplug, driver core, sysfs) keeping the PCMCIA
|
||||
control ioctl needed by cardmgr and cardctl from pcmcia-cs is
|
||||
unnecessary, and makes further cleanups and integration of the
|
||||
unnecessary and potentially harmful (it does not provide for
|
||||
proper locking), and makes further cleanups and integration of the
|
||||
PCMCIA subsystem into the Linux kernel device driver model more
|
||||
difficult. The features provided by cardmgr and cardctl are either
|
||||
handled by the kernel itself now or are available in the new
|
||||
pcmciautils package available at
|
||||
http://kernel.org/pub/linux/utils/kernel/pcmcia/
|
||||
Who: Dominik Brodowski <linux@brodo.de>
|
||||
|
||||
For all architectures except ARM, the associated config symbol
|
||||
has been removed from kernel 2.6.34; for ARM, it will likely
|
||||
be removed from kernel 2.6.35. The actual code will then likely
|
||||
be removed from kernel 2.6.36.
|
||||
Who: Dominik Brodowski <linux@dominikbrodowski.net>
|
||||
|
||||
---------------------------
|
||||
|
||||
@ -468,12 +449,6 @@ Who: Alok N Kataria <akataria@vmware.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: adt7473 hardware monitoring driver
|
||||
When: February 2010
|
||||
Why: Obsoleted by the adt7475 driver.
|
||||
Who: Jean Delvare <khali@linux-fr.org>
|
||||
|
||||
---------------------------
|
||||
What: Support for lcd_switch and display_get in asus-laptop driver
|
||||
When: March 2010
|
||||
Why: These two features use non-standard interfaces. There are the
|
||||
@ -493,3 +468,124 @@ Why: These two features use non-standard interfaces. There are the
|
||||
Who: Corentin Chary <corentin.chary@gmail.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: usbvideo quickcam_messenger driver
|
||||
When: 2.6.35
|
||||
Files: drivers/media/video/usbvideo/quickcam_messenger.[ch]
|
||||
Why: obsolete v4l1 driver replaced by gspca_stv06xx
|
||||
Who: Hans de Goede <hdegoede@redhat.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: ov511 v4l1 driver
|
||||
When: 2.6.35
|
||||
Files: drivers/media/video/ov511.[ch]
|
||||
Why: obsolete v4l1 driver replaced by gspca_ov519
|
||||
Who: Hans de Goede <hdegoede@redhat.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: w9968cf v4l1 driver
|
||||
When: 2.6.35
|
||||
Files: drivers/media/video/w9968cf*.[ch]
|
||||
Why: obsolete v4l1 driver replaced by gspca_ov519
|
||||
Who: Hans de Goede <hdegoede@redhat.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: ovcamchip sensor framework
|
||||
When: 2.6.35
|
||||
Files: drivers/media/video/ovcamchip/*
|
||||
Why: Only used by obsoleted v4l1 drivers
|
||||
Who: Hans de Goede <hdegoede@redhat.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: stv680 v4l1 driver
|
||||
When: 2.6.35
|
||||
Files: drivers/media/video/stv680.[ch]
|
||||
Why: obsolete v4l1 driver replaced by gspca_stv0680
|
||||
Who: Hans de Goede <hdegoede@redhat.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: zc0301 v4l driver
|
||||
When: 2.6.35
|
||||
Files: drivers/media/video/zc0301/*
|
||||
Why: Duplicate functionality with the gspca_zc3xx driver, zc0301 only
|
||||
supports 2 USB-ID's (because it only supports a limited set of
|
||||
sensors) which are also supported by the gspca_zc3xx driver
|
||||
(which supports 53 USB-ID's in total)
|
||||
Who: Hans de Goede <hdegoede@redhat.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: corgikbd, spitzkbd, tosakbd driver
|
||||
When: 2.6.35
|
||||
Files: drivers/input/keyboard/{corgi,spitz,tosa}kbd.c
|
||||
Why: We now have a generic GPIO based matrix keyboard driver that
|
||||
is fully capable of handling all the keys on these devices.
|
||||
The original drivers manipulate the GPIO registers directly
|
||||
and so are difficult to maintain.
|
||||
Who: Eric Miao <eric.y.miao@gmail.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: corgi_ssp and corgi_ts driver
|
||||
When: 2.6.35
|
||||
Files: arch/arm/mach-pxa/corgi_ssp.c, drivers/input/touchscreen/corgi_ts.c
|
||||
Why: The corgi touchscreen is now deprecated in favour of the generic
|
||||
ads7846.c driver. The noise reduction technique used in corgi_ts.c,
|
||||
that's to wait till vsync before ADC sampling, is also integrated into
|
||||
ads7846 driver now. Provided that the original driver is not generic
|
||||
and is difficult to maintain, it will be removed later.
|
||||
Who: Eric Miao <eric.y.miao@gmail.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: capifs
|
||||
When: February 2011
|
||||
Files: drivers/isdn/capi/capifs.*
|
||||
Why: udev fully replaces this special file system that only contains CAPI
|
||||
NCCI TTY device nodes. User space (pppdcapiplugin) works without
|
||||
noticing the difference.
|
||||
Who: Jan Kiszka <jan.kiszka@web.de>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: KVM memory aliases support
|
||||
When: July 2010
|
||||
Why: Memory aliasing support is used for speeding up guest vga access
|
||||
through the vga windows.
|
||||
|
||||
Modern userspace no longer uses this feature, so it's just bitrotted
|
||||
code and can be removed with no impact.
|
||||
Who: Avi Kivity <avi@redhat.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: KVM kernel-allocated memory slots
|
||||
When: July 2010
|
||||
Why: Since 2.6.25, kvm supports user-allocated memory slots, which are
|
||||
much more flexible than kernel-allocated slots. All current userspace
|
||||
supports the newer interface and this code can be removed with no
|
||||
impact.
|
||||
Who: Avi Kivity <avi@redhat.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: KVM paravirt mmu host support
|
||||
When: January 2011
|
||||
Why: The paravirt mmu host support is slower than non-paravirt mmu, both
|
||||
on newer and older hardware. It is already not exposed to the guest,
|
||||
and kept only for live migration purposes.
|
||||
Who: Avi Kivity <avi@redhat.com>
|
||||
|
||||
----------------------------
|
||||
|
||||
What: "acpi=ht" boot option
|
||||
When: 2.6.35
|
||||
Why: Useful in 2003, implementation is a hack.
|
||||
Generally invoked by accident today.
|
||||
Seen as doing more harm than good.
|
||||
Who: Len Brown <len.brown@intel.com>
|
||||
|
@ -16,6 +16,8 @@ befs.txt
|
||||
- information about the BeOS filesystem for Linux.
|
||||
bfs.txt
|
||||
- info for the SCO UnixWare Boot Filesystem (BFS).
|
||||
ceph.txt
|
||||
- info for the Ceph Distributed File System
|
||||
cifs.txt
|
||||
- description of the CIFS filesystem.
|
||||
coda.txt
|
||||
@ -32,6 +34,8 @@ dlmfs.txt
|
||||
- info on the userspace interface to the OCFS2 DLM.
|
||||
dnotify.txt
|
||||
- info about directory notification in Linux.
|
||||
dnotify_test.c
|
||||
- example program for dnotify
|
||||
ecryptfs.txt
|
||||
- docs on eCryptfs: stacked cryptographic filesystem for Linux.
|
||||
exofs.txt
|
||||
@ -62,6 +66,8 @@ jfs.txt
|
||||
- info and mount options for the JFS filesystem.
|
||||
locks.txt
|
||||
- info on file locking implementations, flock() vs. fcntl(), etc.
|
||||
logfs.txt
|
||||
- info on the LogFS flash filesystem.
|
||||
mandatory-locking.txt
|
||||
- info on the Linux implementation of Sys V mandatory file locking.
|
||||
ncpfs.txt
|
||||
|
@ -37,6 +37,15 @@ For Plan 9 From User Space applications (http://swtch.com/plan9)
|
||||
|
||||
mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER
|
||||
|
||||
For server running on QEMU host with virtio transport:
|
||||
|
||||
mount -t 9p -o trans=virtio <mount_tag> /mnt/9
|
||||
|
||||
where mount_tag is the tag associated by the server to each of the exported
|
||||
mount points. Each 9P export is seen by the client as a virtio device with an
|
||||
associated "mount_tag" property. Available mount tags can be
|
||||
seen by reading /sys/bus/virtio/drivers/9pnet_virtio/virtio<n>/mount_tag files.
|
||||
|
||||
OPTIONS
|
||||
=======
|
||||
|
||||
@ -47,7 +56,7 @@ OPTIONS
|
||||
fd - use passed file descriptors for connection
|
||||
(see rfdno and wfdno)
|
||||
virtio - connect to the next virtio channel available
|
||||
(from lguest or KVM with trans_virtio module)
|
||||
(from QEMU with trans_virtio module)
|
||||
rdma - connect to a specified RDMA channel
|
||||
|
||||
uname=name user name to attempt mount as on the remote server. The
|
||||
@ -85,7 +94,12 @@ OPTIONS
|
||||
|
||||
port=n port to connect to on the remote server
|
||||
|
||||
noextend force legacy mode (no 9p2000.u semantics)
|
||||
noextend force legacy mode (no 9p2000.u or 9p2000.L semantics)
|
||||
|
||||
version=name Select 9P protocol version. Valid options are:
|
||||
9p2000 - Legacy mode (same as noextend)
|
||||
9p2000.u - Use 9P2000.u protocol
|
||||
9p2000.L - Use 9P2000.L protocol
|
||||
|
||||
dfltuid attempt to mount as a particular uid
|
||||
|
||||
|
@ -460,13 +460,6 @@ in sys_read() and friends.
|
||||
|
||||
--------------------------- dquot_operations -------------------------------
|
||||
prototypes:
|
||||
int (*initialize) (struct inode *, int);
|
||||
int (*drop) (struct inode *);
|
||||
int (*alloc_space) (struct inode *, qsize_t, int);
|
||||
int (*alloc_inode) (const struct inode *, unsigned long);
|
||||
int (*free_space) (struct inode *, qsize_t);
|
||||
int (*free_inode) (const struct inode *, unsigned long);
|
||||
int (*transfer) (struct inode *, struct iattr *);
|
||||
int (*write_dquot) (struct dquot *);
|
||||
int (*acquire_dquot) (struct dquot *);
|
||||
int (*release_dquot) (struct dquot *);
|
||||
@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
|
||||
What filesystem should expect from the generic quota functions:
|
||||
|
||||
FS recursion Held locks when called
|
||||
initialize: yes maybe dqonoff_sem
|
||||
drop: yes -
|
||||
alloc_space: ->mark_dirty() -
|
||||
alloc_inode: ->mark_dirty() -
|
||||
free_space: ->mark_dirty() -
|
||||
free_inode: ->mark_dirty() -
|
||||
transfer: yes -
|
||||
write_dquot: yes dqonoff_sem or dqptr_sem
|
||||
acquire_dquot: yes dqonoff_sem or dqptr_sem
|
||||
release_dquot: yes dqonoff_sem or dqptr_sem
|
||||
@ -495,10 +481,6 @@ write_info: yes dqonoff_sem
|
||||
FS recursion means calling ->quota_read() and ->quota_write() from superblock
|
||||
operations.
|
||||
|
||||
->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
|
||||
only directly by the filesystem and do not call any fs functions only
|
||||
the ->mark_dirty() operation.
|
||||
|
||||
More details about quota locking can be found in fs/dquot.c.
|
||||
|
||||
--------------------------- vm_operations_struct -----------------------------
|
||||
|
8
Documentation/filesystems/Makefile
Normal file
8
Documentation/filesystems/Makefile
Normal file
@ -0,0 +1,8 @@
|
||||
# kbuild trick to avoid linker error. Can be omitted if a module is built.
|
||||
obj- := dummy.o
|
||||
|
||||
# List of programs to build
|
||||
hostprogs-y := dnotify_test
|
||||
|
||||
# Tell kbuild to always build the programs
|
||||
always := $(hostprogs-y)
|
140
Documentation/filesystems/ceph.txt
Normal file
140
Documentation/filesystems/ceph.txt
Normal file
@ -0,0 +1,140 @@
|
||||
Ceph Distributed File System
|
||||
============================
|
||||
|
||||
Ceph is a distributed network file system designed to provide good
|
||||
performance, reliability, and scalability.
|
||||
|
||||
Basic features include:
|
||||
|
||||
* POSIX semantics
|
||||
* Seamless scaling from 1 to many thousands of nodes
|
||||
* High availability and reliability. No single point of failure.
|
||||
* N-way replication of data across storage nodes
|
||||
* Fast recovery from node failures
|
||||
* Automatic rebalancing of data on node addition/removal
|
||||
* Easy deployment: most FS components are userspace daemons
|
||||
|
||||
Also,
|
||||
* Flexible snapshots (on any directory)
|
||||
* Recursive accounting (nested files, directories, bytes)
|
||||
|
||||
In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
|
||||
on symmetric access by all clients to shared block devices, Ceph
|
||||
separates data and metadata management into independent server
|
||||
clusters, similar to Lustre. Unlike Lustre, however, metadata and
|
||||
storage nodes run entirely as user space daemons. Storage nodes
|
||||
utilize btrfs to store data objects, leveraging its advanced features
|
||||
(checksumming, metadata replication, etc.). File data is striped
|
||||
across storage nodes in large chunks to distribute workload and
|
||||
facilitate high throughputs. When storage nodes fail, data is
|
||||
re-replicated in a distributed fashion by the storage nodes themselves
|
||||
(with some minimal coordination from a cluster monitor), making the
|
||||
system extremely efficient and scalable.
|
||||
|
||||
Metadata servers effectively form a large, consistent, distributed
|
||||
in-memory cache above the file namespace that is extremely scalable,
|
||||
dynamically redistributes metadata in response to workload changes,
|
||||
and can tolerate arbitrary (well, non-Byzantine) node failures. The
|
||||
metadata server takes a somewhat unconventional approach to metadata
|
||||
storage to significantly improve performance for common workloads. In
|
||||
particular, inodes with only a single link are embedded in
|
||||
directories, allowing entire directories of dentries and inodes to be
|
||||
loaded into its cache with a single I/O operation. The contents of
|
||||
extremely large directories can be fragmented and managed by
|
||||
independent metadata servers, allowing scalable concurrent access.
|
||||
|
||||
The system offers automatic data rebalancing/migration when scaling
|
||||
from a small cluster of just a few nodes to many hundreds, without
|
||||
requiring an administrator to carve the data set into static volumes or
|
||||
go through the tedious process of migrating data between servers.
|
||||
When the file system approaches full, new nodes can be easily added
|
||||
and things will "just work."
|
||||
|
||||
Ceph includes a flexible snapshot mechanism that allows a user to create
|
||||
a snapshot on any subdirectory (and its nested contents) in the
|
||||
system. Snapshot creation and deletion are as simple as 'mkdir
|
||||
.snap/foo' and 'rmdir .snap/foo'.
|
||||
|
||||
Ceph also provides some recursive accounting on directories for nested
|
||||
files and bytes. That is, a 'getfattr -d foo' on any directory in the
|
||||
system will reveal the total number of nested regular files and
|
||||
subdirectories, and a summation of all nested file sizes. This makes
|
||||
the identification of large disk space consumers relatively quick, as
|
||||
no 'du' or similar recursive scan of the file system is required.
|
||||
|
||||
|
||||
Mount Syntax
|
||||
============
|
||||
|
||||
The basic mount syntax is:
|
||||
|
||||
# mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
|
||||
|
||||
You only need to specify a single monitor, as the client will get the
|
||||
full list when it connects. (However, if the monitor you specify
|
||||
happens to be down, the mount won't succeed.) The port can be left
|
||||
off if the monitor is using the default. So if the monitor is at
|
||||
1.2.3.4,
|
||||
|
||||
# mount -t ceph 1.2.3.4:/ /mnt/ceph
|
||||
|
||||
is sufficient. If /sbin/mount.ceph is installed, a hostname can be
|
||||
used instead of an IP address.
|
||||
|
||||
|
||||
|
||||
Mount Options
|
||||
=============
|
||||
|
||||
ip=A.B.C.D[:N]
|
||||
Specify the IP and/or port the client should bind to locally.
|
||||
There is normally not much reason to do this. If the IP is not
|
||||
specified, the client's IP address is determined by looking at the
|
||||
address its connection to the monitor originates from.
|
||||
|
||||
wsize=X
|
||||
Specify the maximum write size in bytes. By default there is no
|
||||
maximum. Ceph will normally size writes based on the file stripe
|
||||
size.
|
||||
|
||||
rsize=X
|
||||
Specify the maximum readahead.
|
||||
|
||||
mount_timeout=X
|
||||
Specify the timeout value for mount (in seconds), in the case
|
||||
of a non-responsive Ceph file system. The default is 30
|
||||
seconds.
|
||||
|
||||
rbytes
|
||||
When stat() is called on a directory, set st_size to 'rbytes',
|
||||
the summation of file sizes over all files nested beneath that
|
||||
directory. This is the default.
|
||||
|
||||
norbytes
|
||||
When stat() is called on a directory, set st_size to the
|
||||
number of entries in that directory.
|
||||
|
||||
nocrc
|
||||
Disable CRC32C calculation for data writes. If set, the storage node
|
||||
must rely on TCP's error correction to detect data corruption
|
||||
in the data payload.
|
||||
|
||||
noasyncreaddir
|
||||
Disable the client's use of its local cache to satisfy readdir
|
||||
requests. (This does not change correctness; the client uses
|
||||
cached metadata only when a lease or capability ensures it is
|
||||
valid.)
|
||||
|
||||
|
||||
More Information
|
||||
================
|
||||
|
||||
For more information on Ceph, see the home page at
|
||||
http://ceph.newdream.net/
|
||||
|
||||
The Linux kernel client source tree is available at
|
||||
git://ceph.newdream.net/git/ceph-client.git
|
||||
git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
|
||||
|
||||
and the source for the full system is at
|
||||
git://ceph.newdream.net/git/ceph.git
|
@ -62,7 +62,8 @@ changes are :
|
||||
2. Insertion of a dentry into the hash table is done using
|
||||
hlist_add_head_rcu() which takes care of ordering the writes - the
|
||||
writes to the dentry must be visible before the dentry is
|
||||
inserted. This works in conjunction with hlist_for_each_rcu() while
|
||||
inserted. This works in conjunction with hlist_for_each_rcu(),
|
||||
which has since been replaced by hlist_for_each_entry_rcu(), while
|
||||
walking the hash chain. The only requirement is that all
|
||||
initialization to the dentry must be done before
|
||||
hlist_add_head_rcu() since we don't have dcache_lock protection
|
||||
|
@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL.
|
||||
|
||||
Example
|
||||
-------
|
||||
See Documentation/filesystems/dnotify_test.c for an example.
|
||||
|
||||
#define _GNU_SOURCE /* needed to get the defines */
|
||||
#include <fcntl.h> /* in glibc 2.2 this has the needed
|
||||
values defined */
|
||||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
static volatile int event_fd;
|
||||
|
||||
static void handler(int sig, siginfo_t *si, void *data)
|
||||
{
|
||||
event_fd = si->si_fd;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
struct sigaction act;
|
||||
int fd;
|
||||
|
||||
act.sa_sigaction = handler;
|
||||
sigemptyset(&act.sa_mask);
|
||||
act.sa_flags = SA_SIGINFO;
|
||||
sigaction(SIGRTMIN + 1, &act, NULL);
|
||||
|
||||
fd = open(".", O_RDONLY);
|
||||
fcntl(fd, F_SETSIG, SIGRTMIN + 1);
|
||||
fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
|
||||
/* we will now be notified if any of the files
|
||||
in "." is modified or new files are created */
|
||||
while (1) {
|
||||
pause();
|
||||
printf("Got event on fd=%d\n", event_fd);
|
||||
}
|
||||
}
|
||||
NOTE
|
||||
----
|
||||
Beginning with Linux 2.6.13, dnotify has been replaced by inotify.
|
||||
See Documentation/filesystems/inotify.txt for more information on it.
|
||||
|
34
Documentation/filesystems/dnotify_test.c
Normal file
34
Documentation/filesystems/dnotify_test.c
Normal file
@ -0,0 +1,34 @@
|
||||
#define _GNU_SOURCE /* needed to get the defines */
|
||||
#include <fcntl.h> /* in glibc 2.2 this has the needed
|
||||
values defined */
|
||||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
static volatile int event_fd;
|
||||
|
||||
static void handler(int sig, siginfo_t *si, void *data)
|
||||
{
|
||||
event_fd = si->si_fd;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
struct sigaction act;
|
||||
int fd;
|
||||
|
||||
act.sa_sigaction = handler;
|
||||
sigemptyset(&act.sa_mask);
|
||||
act.sa_flags = SA_SIGINFO;
|
||||
sigaction(SIGRTMIN + 1, &act, NULL);
|
||||
|
||||
fd = open(".", O_RDONLY);
|
||||
fcntl(fd, F_SETSIG, SIGRTMIN + 1);
|
||||
fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
|
||||
/* we will now be notified if any of the files
|
||||
in "." is modified or new files are created */
|
||||
while (1) {
|
||||
pause();
|
||||
printf("Got event on fd=%d\n", event_fd);
|
||||
}
|
||||
}
|
@ -196,7 +196,7 @@ nobarrier This also requires an IO stack which can support
|
||||
also be used to enable or disable barriers, for
|
||||
consistency with other ext4 mount options.
|
||||
|
||||
inode_readahead=n This tuning parameter controls the maximum
|
||||
inode_readahead_blks=n This tuning parameter controls the maximum
|
||||
number of inode table blocks that ext4's inode
|
||||
table readahead algorithm will pre-read into
|
||||
the buffer cache. The default value is 32 blocks.
|
||||
|
241
Documentation/filesystems/logfs.txt
Normal file
241
Documentation/filesystems/logfs.txt
Normal file
@ -0,0 +1,241 @@
|
||||
|
||||
The LogFS Flash Filesystem
|
||||
==========================
|
||||
|
||||
Specification
|
||||
=============
|
||||
|
||||
Superblocks
|
||||
-----------
|
||||
|
||||
Two superblocks exist at the beginning and end of the filesystem.
|
||||
Each superblock is 256 Bytes large, with another 3840 Bytes reserved
|
||||
for future purposes, making a total of 4096 Bytes.
|
||||
|
||||
Superblock locations may differ for MTD and block devices. On MTD the
|
||||
first non-bad block contains a superblock in the first 4096 Bytes and
|
||||
the last non-bad block contains a superblock in the last 4096 Bytes.
|
||||
On block devices, the first 4096 Bytes of the device contain the first
|
||||
superblock and the last aligned 4096 Byte-block contains the second
|
||||
superblock.
|
||||
|
||||
For the most part, the superblocks can be considered read-only. They
|
||||
are written only to correct errors detected within the superblocks,
|
||||
move the journal and change the filesystem parameters through tunefs.
|
||||
As a result, the superblock does not contain any fields that require
|
||||
constant updates, like the amount of free space, etc.
|
||||
|
||||
Segments
|
||||
--------
|
||||
|
||||
The space in the device is split up into equal-sized segments.
|
||||
Segments are the primary write unit of LogFS. Within each segment,
|
||||
writes happen from front (low addresses) to back (high addresses). If
|
||||
only a partial segment has been written, the segment number, the
|
||||
current position within and optionally a write buffer are stored in
|
||||
the journal.
|
||||
|
||||
Segments are erased as a whole. Therefore Garbage Collection may be
|
||||
required to completely free a segment before doing so.
|
||||
|
||||
Journal
|
||||
--------
|
||||
|
||||
The journal contains all global information about the filesystem that
|
||||
is subject to frequent change. At mount time, it has to be scanned
|
||||
for the most recent commit entry, which contains a list of pointers to
|
||||
all currently valid entries.
|
||||
|
||||
Object Store
|
||||
------------
|
||||
|
||||
All space except for the superblocks and journal is part of the object
|
||||
store. Each segment contains a segment header and a number of
|
||||
objects, each consisting of the object header and the payload.
|
||||
Objects are either inodes, directory entries (dentries), file data
|
||||
blocks or indirect blocks.
|
||||
|
||||
Levels
|
||||
------
|
||||
|
||||
Garbage collection (GC) may fail if all data is written
|
||||
indiscriminately. One requirement of GC is that data is separated
|
||||
roughly according to the distance between the tree root and the data.
|
||||
Effectively that means all file data is on level 0, indirect blocks
|
||||
are on levels 1, 2, 3, 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
|
||||
respectively. Inode file data is on level 6 for the inodes and 7-11
|
||||
for indirect blocks.
|
||||
|
||||
Each segment contains objects of a single level only. As a result,
|
||||
each level requires its own separate segment to be open for writing.
|
||||
|
||||
Inode File
|
||||
----------
|
||||
|
||||
All inodes are stored in a special file, the inode file. Single
|
||||
exception is the inode file's inode (master inode) which for obvious
|
||||
reasons is stored in the journal instead. Instead of data blocks, the
|
||||
leaf nodes of the inode files are inodes.
|
||||
|
||||
Aliases
|
||||
-------
|
||||
|
||||
Writes in LogFS are done by means of a wandering tree. A naïve
|
||||
implementation would require that for each write of a block, all
|
||||
parent blocks are written as well, since the block pointers have
|
||||
changed. Such an implementation would not be very efficient.
|
||||
|
||||
In LogFS, the block pointer changes are cached in the journal by means
|
||||
of alias entries. Each alias consists of its logical address - inode
|
||||
number, block index, level and child number (index into block) - and
|
||||
the changed data. Any 8-byte word can be changed in this manner.
|
||||
|
||||
Currently aliases are used for block pointers, file size, file used
|
||||
bytes and the height of an inode's indirect tree.
|
||||
|
||||
Segment Aliases
|
||||
---------------
|
||||
|
||||
Related to regular aliases, these are used to handle bad blocks.
|
||||
Initially, bad blocks are handled by moving the affected segment
|
||||
content to a spare segment and noting this move in the journal with a
|
||||
segment alias, a simple (to, from) tuple. GC will later empty this
|
||||
segment and the alias can be removed again. This is used on MTD only.
|
||||
|
||||
Vim
|
||||
---
|
||||
|
||||
By cleverly predicting the life time of data, it is possible to
|
||||
separate long-living data from short-living data and thereby reduce
|
||||
the GC overhead later. Each type of distinct life expectancy (vim) can
|
||||
have a separate segment open for writing. Each (level, vim) tuple can
|
||||
be open just once. If an open segment with unknown vim is encountered
|
||||
at mount time, it is closed and ignored henceforth.
|
||||
|
||||
Indirect Tree
|
||||
-------------
|
||||
|
||||
Inodes in LogFS are similar to FFS-style filesystems with direct and
|
||||
indirect block pointers. One difference is that LogFS uses a single
|
||||
indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
|
||||
A height field in the inode defines the height of the indirect tree
|
||||
and thereby the indirection of the pointer.
|
||||
|
||||
Another difference is the addressing of indirect blocks. In LogFS,
|
||||
the first 16 pointers in the first indirect block are left empty,
|
||||
corresponding to the 16 direct pointers in the inode. In ext2 (maybe
|
||||
others as well) the first pointer in the first indirect block
|
||||
corresponds to logical block 12, skipping the 12 direct pointers.
|
||||
So where ext2 is using arithmetic to better utilize space, LogFS keeps
|
||||
arithmetic simple and uses compression to save space.
|
||||
|
||||
Compression
|
||||
-----------
|
||||
|
||||
Both file data and metadata can be compressed. Compression for file
|
||||
data can be enabled with chattr +c and disabled with chattr -c. Doing
|
||||
so has no effect on existing data, but new data will be stored
|
||||
accordingly. New inodes will inherit the compression flag of the
|
||||
parent directory.
|
||||
|
||||
Metadata is always compressed. However, the space accounting ignores
|
||||
this and charges for the uncompressed size. Failing to do so could
|
||||
result in GC failures when, after moving some data, indirect blocks
|
||||
compress worse than previously. Even on a 100% full medium, GC may
|
||||
not consume any extra space, so the compression gains are lost space
|
||||
to the user.
|
||||
|
||||
However, they are not lost space to the filesystem internals. By
|
||||
cheating the user for those bytes, the filesystem gained some slack
|
||||
space and GC will run less often and faster.
|
||||
|
||||
Garbage Collection and Wear Leveling
|
||||
------------------------------------
|
||||
|
||||
Garbage collection is invoked whenever the number of free segments
|
||||
falls below a threshold. The best (known) candidate is picked based
|
||||
on the least amount of valid data contained in the segment. All
|
||||
remaining valid data is copied elsewhere, thereby invalidating it.
|
||||
|
||||
The GC code also checks for aliases and writes them back if their
|
||||
number gets too large.
|
||||
|
||||
Wear leveling is done by occasionally picking a suboptimal segment for
|
||||
garbage collection. If a stale segment's erase count is significantly
|
||||
lower than the active segments' erase counts, it will be picked. Wear
|
||||
leveling is rate limited, so it will never monopolize the device for
|
||||
more than one segment worth at a time.
|
||||
|
||||
Values for "occasionally", "significantly lower" are compile time
|
||||
constants.
|
||||
|
||||
Hashed directories
|
||||
------------------
|
||||
|
||||
To satisfy efficient lookup(), directory entries are hashed and
|
||||
located based on the hash. In order to both support large directories
|
||||
and not be overly inefficient for small directories, several hash
|
||||
tables of increasing size are used. For each table, the hash value
|
||||
modulo the table size gives the table index.
|
||||
|
||||
Table sizes are chosen to limit the number of indirect blocks with a
|
||||
fully populated table to 0, 1, 2 or 3 respectively. So the first
|
||||
table contains 16 entries, the second 512-16, etc.
|
||||
|
||||
The last table is special in several ways. First its size depends on
|
||||
the effective 32bit limit on telldir/seekdir cookies. Since logfs
|
||||
uses the upper half of the address space for indirect blocks, the size
|
||||
is limited to 2^31. Secondly the table contains hash buckets with 16
|
||||
entries each.
|
||||
|
||||
Using single-entry buckets would result in birthday "attacks". At
|
||||
just 2^16 used entries, hash collisions would be likely (P >= 0.5).
|
||||
My math skills are insufficient to do the combinatorics for the 17x
|
||||
collisions necessary to overflow a bucket, but testing showed that in
|
||||
10,000 runs the lowest directory fill before a bucket overflow was
|
||||
188,057,130 entries with an average of 315,149,915 entries. So for
|
||||
directory sizes of up to a million, bucket overflows should be
|
||||
virtually impossible under normal circumstances.
|
||||
|
||||
With carefully chosen filenames, it is obviously possible to cause an
|
||||
overflow with just 21 entries (4 higher tables + 16 entries + 1). So
|
||||
there may be a security concern if a malicious user has write access
|
||||
to a directory.
|
||||
|
||||
Open For Discussion
|
||||
===================
|
||||
|
||||
Device Address Space
|
||||
--------------------
|
||||
|
||||
A device address space is used for caching. Both block devices and
|
||||
MTD provide functions to either read a single page or write a segment.
|
||||
Partial segments may be written for data integrity, but where possible
|
||||
complete segments are written for performance on simple block device
|
||||
flash media.
|
||||
|
||||
Meta Inodes
|
||||
-----------
|
||||
|
||||
Inodes are stored in the inode file, which is just a regular file for
|
||||
most purposes. At umount time, however, the inode file needs to
|
||||
remain open until all dirty inodes are written. So
|
||||
generic_shutdown_super() may not close this inode, but shouldn't
|
||||
complain about remaining inodes due to the inode file either. Same
|
||||
goes for mapping inode of the device address space.
|
||||
|
||||
Currently logfs uses a hack that essentially copies part of fs/inode.c
|
||||
code over. A general solution would be preferred.
|
||||
|
||||
Indirect block mapping
|
||||
----------------------
|
||||
|
||||
With compression, the block device (or mapping inode) cannot be used
|
||||
to cache indirect blocks. Some other place is required. Currently
|
||||
logfs uses the top half of each inode's address space. The low 8TB
|
||||
(on 32bit) are filled with file data, the high 8TB are used for
|
||||
indirect blocks.
|
||||
|
||||
One problem is that 16TB files created on 64bit systems actually have
|
||||
data in the top 8TB. But files >16TB would cause problems anyway, so
|
||||
only the limit has changed.
|
@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4
|
||||
on or off; rpc.nfsd does this correctly.)
|
||||
|
||||
The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
|
||||
on the latest NFSv4.1 Internet Draft:
|
||||
http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
|
||||
on RFC 5661.
|
||||
|
||||
From the many new features in NFSv4.1 the current implementation
|
||||
focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
|
||||
@ -44,7 +43,7 @@ interoperability problems with future clients. Known issues:
|
||||
trunking, but this is a mandatory feature, and its use is
|
||||
recommended to clients in a number of places. (E.g. to ensure
|
||||
timely renewal in case an existing connection's retry timeouts
|
||||
have gotten too long; see section 8.3 of the draft.)
|
||||
have gotten too long; see section 8.3 of the RFC.)
|
||||
Therefore, lack of this feature may cause future clients to
|
||||
fail.
|
||||
- Incomplete backchannel support: incomplete backchannel gss
|
||||
|
@ -28,7 +28,7 @@ described in the man pages included in the package.
|
||||
Project web page: http://www.nilfs.org/en/
|
||||
Download page: http://www.nilfs.org/en/download.html
|
||||
Git tree web page: http://www.nilfs.org/git/
|
||||
NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users
|
||||
List info: http://vger.kernel.org/vger-lists.html#linux-nilfs
|
||||
|
||||
Caveats
|
||||
=======
|
||||
@ -74,6 +74,9 @@ norecovery Disable recovery of the filesystem on mount.
|
||||
This disables every write access on the device for
|
||||
read-only mounts or snapshots. This option will fail
|
||||
for r/w mounts on an unclean volume.
|
||||
discard Issue discard/TRIM commands to the underlying block
|
||||
device when blocks are freed. This is useful for SSD
|
||||
devices and sparse/thinly-provisioned LUNs.
|
||||
|
||||
NILFS2 usage
|
||||
============
|
||||
|
@ -164,6 +164,7 @@ read the file /proc/PID/status:
|
||||
VmExe: 68 kB
|
||||
VmLib: 1412 kB
|
||||
VmPTE: 20 kb
|
||||
VmSwap: 0 kB
|
||||
Threads: 1
|
||||
SigQ: 0/28578
|
||||
SigPnd: 0000000000000000
|
||||
@ -177,7 +178,6 @@ read the file /proc/PID/status:
|
||||
CapBnd: ffffffffffffffff
|
||||
voluntary_ctxt_switches: 0
|
||||
nonvoluntary_ctxt_switches: 1
|
||||
Stack usage: 12 kB
|
||||
|
||||
This shows you nearly the same information you would get if you viewed it with
|
||||
the ps command. In fact, ps uses the proc file system to obtain its
|
||||
@ -189,7 +189,13 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file
|
||||
contains detailed information about the process itself. Its fields are
|
||||
explained in Table 1-4.
|
||||
|
||||
Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
|
||||
(for SMP CONFIG users)
|
||||
For making accounting scalable, RSS related information are handled in
|
||||
asynchronous manner and the value may not be very precise. To see a precise
|
||||
snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
|
||||
It's slow but very precise.
|
||||
|
||||
Table 1-2: Contents of the status files (as of 2.6.30-rc7)
|
||||
..............................................................................
|
||||
Field Content
|
||||
Name filename of the executable
|
||||
@ -214,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
|
||||
VmExe size of text segment
|
||||
VmLib size of shared library code
|
||||
VmPTE size of page table entries
|
||||
VmSwap size of swap usage (the number of referred swapents)
|
||||
Threads number of threads
|
||||
SigQ number of signals queued/max. number for queue
|
||||
SigPnd bitmap of pending signals for the thread
|
||||
@ -231,7 +238,6 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
|
||||
Mems_allowed_list Same as previous, but in "list format"
|
||||
voluntary_ctxt_switches number of voluntary context switches
|
||||
nonvoluntary_ctxt_switches number of non voluntary context switches
|
||||
Stack usage: stack usage high water mark (round up to page size)
|
||||
..............................................................................
|
||||
|
||||
Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
|
||||
@ -432,6 +438,7 @@ Table 1-5: Kernel info in /proc
|
||||
modules List of loaded modules
|
||||
mounts Mounted filesystems
|
||||
net Networking info (see text)
|
||||
pagetypeinfo Additional page allocator information (see text) (2.5)
|
||||
partitions Table of partitions known to the system
|
||||
pci Deprecated info of PCI bus (new way -> /proc/bus/pci/,
|
||||
decoupled by lspci (2.4)
|
||||
@ -586,7 +593,7 @@ Node 0, zone DMA 0 4 5 4 4 3 ...
|
||||
Node 0, zone Normal 1 0 0 1 101 8 ...
|
||||
Node 0, zone HighMem 2 0 0 1 1 0 ...
|
||||
|
||||
Memory fragmentation is a problem under some workloads, and buddyinfo is a
|
||||
External fragmentation is a problem under some workloads, and buddyinfo is a
|
||||
useful tool for helping diagnose these problems. Buddyinfo will give you a
|
||||
clue as to how big an area you can safely allocate, or why a previous
|
||||
allocation failed.
|
||||
@ -596,6 +603,48 @@ available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in
|
||||
ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE
|
||||
available in ZONE_NORMAL, etc...
|
||||
|
||||
More information relevant to external fragmentation can be found in
|
||||
pagetypeinfo.
|
||||
|
||||
> cat /proc/pagetypeinfo
|
||||
Page block order: 9
|
||||
Pages per block: 512
|
||||
|
||||
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
|
||||
Node 0, zone DMA, type Unmovable 0 0 0 1 1 1 1 1 1 1 0
|
||||
Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0
|
||||
Node 0, zone DMA, type Movable 1 1 2 1 2 1 1 0 1 0 2
|
||||
Node 0, zone DMA, type Reserve 0 0 0 0 0 0 0 0 0 1 0
|
||||
Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0
|
||||
Node 0, zone DMA32, type Unmovable 103 54 77 1 1 1 11 8 7 1 9
|
||||
Node 0, zone DMA32, type Reclaimable 0 0 2 1 0 0 0 0 1 0 0
|
||||
Node 0, zone DMA32, type Movable 169 152 113 91 77 54 39 13 6 1 452
|
||||
Node 0, zone DMA32, type Reserve 1 2 2 2 2 0 1 1 1 1 0
|
||||
Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0
|
||||
|
||||
Number of blocks type Unmovable Reclaimable Movable Reserve Isolate
|
||||
Node 0, zone DMA 2 0 5 1 0
|
||||
Node 0, zone DMA32 41 6 967 2 0
|
||||
|
||||
Fragmentation avoidance in the kernel works by grouping pages of different
|
||||
migrate types into the same contiguous regions of memory called page blocks.
|
||||
A page block is typically the size of the default hugepage size e.g. 2MB on
|
||||
X86-64. By keeping pages grouped based on their ability to move, the kernel
|
||||
can reclaim pages within a page block to satisfy a high-order allocation.
|
||||
|
||||
The pagetypeinfo begins with information on the size of a page block. It
|
||||
then gives the same type of information as buddyinfo except broken down
|
||||
by migrate-type and finishes with details on how many page blocks of each
|
||||
type exist.
|
||||
|
||||
If min_free_kbytes has been tuned correctly (recommendations made by hugeadm
|
||||
from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can
|
||||
make an estimate of the likely number of huge pages that can be allocated
|
||||
at a given point in time. All the "Movable" blocks should be allocatable
|
||||
unless memory has been mlock()'d. Some of the Reclaimable blocks should
|
||||
also be allocatable although a lot of filesystem metadata may have to be
|
||||
reclaimed to achieve this.
|
||||
|
||||
..............................................................................
|
||||
|
||||
meminfo:
|
||||
|
@ -837,6 +837,9 @@ replicas continue to be exactly same.
|
||||
individual lists does not affect propagation or the way propagation
|
||||
tree is modified by operations.
|
||||
|
||||
All vfsmounts in a peer group have the same ->mnt_master. If it is
|
||||
non-NULL, they form a contiguous (ordered) segment of slave list.
|
||||
|
||||
An example propagation tree looks as shown in the figure below.
|
||||
[ NOTE: Though it looks like a forest, if we consider all the shared
|
||||
mounts as a conceptual entity called 'pnode', it becomes a tree]
|
||||
@ -874,8 +877,19 @@ replicas continue to be exactly same.
|
||||
|
||||
NOTE: The propagation tree is orthogonal to the mount tree.
|
||||
|
||||
8B Locking:
|
||||
|
||||
8B Algorithm:
|
||||
->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected
|
||||
by namespace_sem (exclusive for modifications, shared for reading).
|
||||
|
||||
Normally we have ->mnt_flags modifications serialized by vfsmount_lock.
|
||||
There are two exceptions: do_add_mount() and clone_mnt().
|
||||
The former modifies a vfsmount that has not been visible in any shared
|
||||
data structures yet.
|
||||
The latter holds namespace_sem and the only references to vfsmount
|
||||
are in lists that can't be traversed without namespace_sem.
|
||||
|
||||
8C Algorithm:
|
||||
|
||||
The crux of the implementation resides in rbind/move operation.
|
||||
|
||||
|
@ -82,11 +82,13 @@ tmpfs has a mount option to set the NUMA memory allocation policy for
|
||||
all files in that instance (if CONFIG_NUMA is enabled) - which can be
|
||||
adjusted on the fly via 'mount -o remount ...'
|
||||
|
||||
mpol=default prefers to allocate memory from the local node
|
||||
mpol=default use the process allocation policy
|
||||
(see set_mempolicy(2))
|
||||
mpol=prefer:Node prefers to allocate memory from the given Node
|
||||
mpol=bind:NodeList allocates memory only from nodes in NodeList
|
||||
mpol=interleave prefers to allocate from each node in turn
|
||||
mpol=interleave:NodeList allocates from each node of NodeList in turn
|
||||
mpol=local prefers to allocate memory from the local node
|
||||
|
||||
NodeList format is a comma-separated list of decimal numbers and ranges,
|
||||
a range being two hyphen-separated decimal numbers, the smallest and
|
||||
@ -134,3 +136,5 @@ Author:
|
||||
Christoph Rohland <cr@sap.com>, 1.12.01
|
||||
Updated:
|
||||
Hugh Dickins, 4 June 2007
|
||||
Updated:
|
||||
KOSAKI Motohiro, 16 Mar 2010
|
||||
|
@ -253,6 +253,70 @@ pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown).
|
||||
Also note that it's your responsibility to have stopped using a GPIO
|
||||
before you free it.
|
||||
|
||||
Considering in most cases GPIOs are actually configured right after they
|
||||
are claimed, three additional calls are defined:
|
||||
|
||||
/* request a single GPIO, with initial configuration specified by
|
||||
* 'flags', identical to gpio_request() wrt other arguments and
|
||||
* return value
|
||||
*/
|
||||
int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
|
||||
|
||||
/* request multiple GPIOs in a single call
|
||||
*/
|
||||
int gpio_request_array(struct gpio *array, size_t num);
|
||||
|
||||
/* release multiple GPIOs in a single call
|
||||
*/
|
||||
void gpio_free_array(struct gpio *array, size_t num);
|
||||
|
||||
where 'flags' is currently defined to specify the following properties:
|
||||
|
||||
* GPIOF_DIR_IN - to configure direction as input
|
||||
* GPIOF_DIR_OUT - to configure direction as output
|
||||
|
||||
* GPIOF_INIT_LOW - as output, set initial level to LOW
|
||||
* GPIOF_INIT_HIGH - as output, set initial level to HIGH
|
||||
|
||||
Since GPIOF_INIT_* are only valid when configured as output, the valid
|
||||
combinations are grouped as:
|
||||
|
||||
* GPIOF_IN - configure as input
|
||||
* GPIOF_OUT_INIT_LOW - configured as output, initial level LOW
|
||||
* GPIOF_OUT_INIT_HIGH - configured as output, initial level HIGH
|
||||
|
||||
In the future, these flags can be extended to support more properties such
|
||||
as open-drain status.
|
||||
|
||||
Furthermore, to ease the claim/release of multiple GPIOs, 'struct gpio' is
|
||||
introduced to encapsulate all three fields as:
|
||||
|
||||
struct gpio {
|
||||
unsigned gpio;
|
||||
unsigned long flags;
|
||||
const char *label;
|
||||
};
|
||||
|
||||
A typical example of usage:
|
||||
|
||||
static struct gpio leds_gpios[] = {
|
||||
{ 32, GPIOF_OUT_INIT_HIGH, "Power LED" }, /* default to ON */
|
||||
{ 33, GPIOF_OUT_INIT_LOW, "Green LED" }, /* default to OFF */
|
||||
{ 34, GPIOF_OUT_INIT_LOW, "Red LED" }, /* default to OFF */
|
||||
{ 35, GPIOF_OUT_INIT_LOW, "Blue LED" }, /* default to OFF */
|
||||
{ ... },
|
||||
};
|
||||
|
||||
err = gpio_request_one(31, GPIOF_IN, "Reset Button");
|
||||
if (err)
|
||||
...
|
||||
|
||||
err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios));
|
||||
if (err)
|
||||
...
|
||||
|
||||
gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios));
|
||||
|
||||
|
||||
GPIOs mapped to IRQs
|
||||
--------------------
|
||||
|
@ -30,7 +30,7 @@ Supported chips:
|
||||
bank1_types=1,1,0,0,0,0,0,2,0,0,0,0,2,0,0,1
|
||||
You may also need to specify the fan_sensors option for these boards
|
||||
fan_sensors=5
|
||||
2) There is a seperate abituguru3 driver for these motherboards,
|
||||
2) There is a separate abituguru3 driver for these motherboards,
|
||||
the abituguru (without the 3 !) driver will not work on these
|
||||
motherboards (and vice versa)!
|
||||
|
||||
|
42
Documentation/hwmon/adt7411
Normal file
42
Documentation/hwmon/adt7411
Normal file
@ -0,0 +1,42 @@
|
||||
Kernel driver adt7411
|
||||
=====================
|
||||
|
||||
Supported chips:
|
||||
* Analog Devices ADT7411
|
||||
Prefix: 'adt7411'
|
||||
Addresses scanned: 0x48, 0x4a, 0x4b
|
||||
Datasheet: Publicly available at the Analog Devices website
|
||||
|
||||
Author: Wolfram Sang (based on adt7470 by Darrick J. Wong)
|
||||
|
||||
Description
|
||||
-----------
|
||||
|
||||
This driver implements support for the Analog Devices ADT7411 chip. There may
|
||||
be other chips that implement this interface.
|
||||
|
||||
The ADT7411 can use an I2C/SMBus compatible 2-wire interface or an
|
||||
SPI-compatible 4-wire interface. It provides a 10-bit analog to digital
|
||||
converter which measures 1 temperature, vdd and 8 input voltages. It has an
|
||||
internal temperature sensor, but an external one can also be connected (one
|
||||
loses 2 inputs then). There are high- and low-limit registers for all inputs.
|
||||
|
||||
Check the datasheet for details.
|
||||
|
||||
sysfs-Interface
|
||||
---------------
|
||||
|
||||
in0_input - vdd voltage input
|
||||
in[1-8]_input - analog 1-8 input
|
||||
temp1_input - temperature input
|
||||
|
||||
Besides standard interfaces, this driver adds (0 = off, 1 = on):
|
||||
|
||||
adc_ref_vdd - Use vdd as reference instead of 2.25 V
|
||||
fast_sampling - Sample at 22.5 kHz instead of 1.4 kHz, but drop filters
|
||||
no_average - Turn off averaging over 16 samples
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
SPI, external temperature sensor and limit registers are not supported yet.
|
@ -1,74 +0,0 @@
|
||||
Kernel driver adt7473
|
||||
======================
|
||||
|
||||
Supported chips:
|
||||
* Analog Devices ADT7473
|
||||
Prefix: 'adt7473'
|
||||
Addresses scanned: I2C 0x2C, 0x2D, 0x2E
|
||||
Datasheet: Publicly available at the Analog Devices website
|
||||
|
||||
Author: Darrick J. Wong
|
||||
|
||||
This driver is depreacted, please use the adt7475 driver instead.
|
||||
|
||||
Description
|
||||
-----------
|
||||
|
||||
This driver implements support for the Analog Devices ADT7473 chip family.
|
||||
|
||||
The ADT7473 uses the 2-wire interface compatible with the SMBUS 2.0
|
||||
specification. Using an analog to digital converter it measures three (3)
|
||||
temperatures and two (2) voltages. It has four (4) 16-bit counters for
|
||||
measuring fan speed. There are three (3) PWM outputs that can be used
|
||||
to control fan speed.
|
||||
|
||||
A sophisticated control system for the PWM outputs is designed into the
|
||||
ADT7473 that allows fan speed to be adjusted automatically based on any of the
|
||||
three temperature sensors. Each PWM output is individually adjustable and
|
||||
programmable. Once configured, the ADT7473 will adjust the PWM outputs in
|
||||
response to the measured temperatures without further host intervention.
|
||||
This feature can also be disabled for manual control of the PWM's.
|
||||
|
||||
Each of the measured inputs (voltage, temperature, fan speed) has
|
||||
corresponding high/low limit values. The ADT7473 will signal an ALARM if
|
||||
any measured value exceeds either limit.
|
||||
|
||||
The ADT7473 samples all inputs continuously. The driver will not read
|
||||
the registers more often than once every other second. Further,
|
||||
configuration data is only read once per minute.
|
||||
|
||||
Special Features
|
||||
----------------
|
||||
|
||||
The ADT7473 have a 10-bit ADC and can therefore measure temperatures
|
||||
with 0.25 degC resolution. Temperature readings can be configured either
|
||||
for twos complement format or "Offset 64" format, wherein 63 is subtracted
|
||||
from the raw value to get the temperature value.
|
||||
|
||||
The Analog Devices datasheet is very detailed and describes a procedure for
|
||||
determining an optimal configuration for the automatic PWM control.
|
||||
|
||||
Configuration Notes
|
||||
-------------------
|
||||
|
||||
Besides standard interfaces driver adds the following:
|
||||
|
||||
* PWM Control
|
||||
|
||||
* pwm#_auto_point1_pwm and temp#_auto_point1_temp and
|
||||
* pwm#_auto_point2_pwm and temp#_auto_point2_temp -
|
||||
|
||||
point1: Set the pwm speed at a lower temperature bound.
|
||||
point2: Set the pwm speed at a higher temperature bound.
|
||||
|
||||
The ADT7473 will scale the pwm between the lower and higher pwm speed when
|
||||
the temperature is between the two temperature boundaries. PWM values range
|
||||
from 0 (off) to 255 (full speed). Fan speed will be set to maximum when the
|
||||
temperature sensor associated with the PWM control exceeds temp#_max.
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
The NVIDIA binary driver presents an ADT7473 chip via an on-card i2c bus.
|
||||
Unfortunately, they fail to set the i2c adapter class, so this driver may
|
||||
fail to find the chip until the nvidia driver is patched.
|
102
Documentation/hwmon/amc6821
Normal file
102
Documentation/hwmon/amc6821
Normal file
@ -0,0 +1,102 @@
|
||||
Kernel driver amc6821
|
||||
=====================
|
||||
|
||||
Supported chips:
|
||||
Texas Instruments AMC6821
|
||||
Prefix: 'amc6821'
|
||||
Addresses scanned: 0x18, 0x19, 0x1a, 0x2c, 0x2d, 0x2e, 0x4c, 0x4d, 0x4e
|
||||
Datasheet: http://focus.ti.com/docs/prod/folders/print/amc6821.html
|
||||
|
||||
Authors:
|
||||
Tomaz Mertelj <tomaz.mertelj@guest.arnes.si>
|
||||
|
||||
|
||||
Description
|
||||
-----------
|
||||
|
||||
This driver implements support for the Texas Instruments amc6821 chip.
|
||||
The chip has one on-chip and one remote temperature sensor and one pwm fan
|
||||
regulator.
|
||||
The pwm can be controlled either from software or automatically.
|
||||
|
||||
The driver provides the following sensor accesses in sysfs:
|
||||
|
||||
temp1_input ro on-chip temperature
|
||||
temp1_min rw "
|
||||
temp1_max rw "
|
||||
temp1_crit rw "
|
||||
temp1_min_alarm ro "
|
||||
temp1_max_alarm ro "
|
||||
temp1_crit_alarm ro "
|
||||
|
||||
temp2_input ro remote temperature
|
||||
temp2_min rw "
|
||||
temp2_max rw "
|
||||
temp2_crit rw "
|
||||
temp2_min_alarm ro "
|
||||
temp2_max_alarm ro "
|
||||
temp2_crit_alarm ro "
|
||||
temp2_fault ro "
|
||||
|
||||
fan1_input ro tachometer speed
|
||||
fan1_min rw "
|
||||
fan1_max rw "
|
||||
fan1_fault ro "
|
||||
fan1_div rw Fan divisor can be either 2 or 4.
|
||||
|
||||
pwm1 rw pwm1
|
||||
pwm1_enable rw regulator mode, 1=open loop, 2=fan controlled
|
||||
by remote temperature, 3=fan controlled by
|
||||
combination of the on-chip temperature and
|
||||
remote-sensor temperature,
|
||||
pwm1_auto_channels_temp ro 1 if pwm_enable==2, 3 if pwm_enable==3
|
||||
pwm1_auto_point1_pwm ro Hardwired to 0, shared for both
|
||||
temperature channels.
|
||||
pwm1_auto_point2_pwm rw This value is shared for both temperature
|
||||
channels.
|
||||
pwm1_auto_point3_pwm rw Hardwired to 255, shared for both
|
||||
temperature channels.
|
||||
|
||||
temp1_auto_point1_temp ro Hardwired to temp2_auto_point1_temp
|
||||
which is rw. Below this temperature fan stops.
|
||||
temp1_auto_point2_temp rw The low-temperature limit of the proportional
|
||||
range. Below this temperature
|
||||
pwm1 = pwm1_auto_point2_pwm. It can go from
|
||||
0 degree C to 124 degree C in steps of
|
||||
4 degree C. Read it out after writing to get
|
||||
the actual value.
|
||||
temp1_auto_point3_temp rw Above this temperature fan runs at maximum
|
||||
speed. It can go from temp1_auto_point2_temp.
|
||||
It can only have certain discrete values
|
||||
which depend on temp1_auto_point2_temp and
|
||||
pwm1_auto_point2_pwm. Read it out after
|
||||
writing to get the actual value.
|
||||
|
||||
temp2_auto_point1_temp rw Must be between 0 degree C and 63 degree C and
|
||||
it defines the passive cooling temperature.
|
||||
Below this temperature the fan stops in
|
||||
the closed loop mode.
|
||||
temp2_auto_point2_temp rw The low-temperature limit of the proportional
|
||||
range. Below this temperature
|
||||
pwm1 = pwm1_auto_point2_pwm. It can go from
|
||||
0 degree C to 124 degree C in steps
|
||||
of 4 degree C.
|
||||
|
||||
temp2_auto_point3_temp rw Above this temperature fan runs at maximum
|
||||
speed. It can only have certain discrete
|
||||
values which depend on temp2_auto_point2_temp
|
||||
and pwm1_auto_point2_pwm. Read it out after
|
||||
writing to get actual value.
|
||||
|
||||
|
||||
Module parameters
|
||||
-----------------
|
||||
|
||||
If your board has a BIOS that initializes the amc6821 correctly, you should
|
||||
load the module with: init=0.
|
||||
|
||||
If your board BIOS doesn't initialize the chip, or you want
|
||||
different settings, you can set the following parameters:
|
||||
init=1,
|
||||
pwminv: 0 default pwm output, 1 inverts pwm output.
|
||||
|
296
Documentation/hwmon/asc7621
Normal file
296
Documentation/hwmon/asc7621
Normal file
@ -0,0 +1,296 @@
|
||||
Kernel driver asc7621
|
||||
=====================
|
||||
|
||||
Supported chips:
|
||||
Andigilog aSC7621 and aSC7621a
|
||||
Prefix: 'asc7621'
|
||||
Addresses scanned: I2C 0x2c, 0x2d, 0x2e
|
||||
Datasheet: http://www.fairview5.com/linux/asc7621/asc7621.pdf
|
||||
|
||||
Author:
|
||||
George Joseph
|
||||
|
||||
Description provided by Dave Pivin @ Andigilog:
|
||||
|
||||
Andigilog has both the PECI and pre-PECI versions of the Heceta-6, as
|
||||
Intel calls them. Heceta-6e has high frequency PWM and Heceta-6p has
|
||||
added PECI and a 4th thermal zone. The Andigilog aSC7611 is the
|
||||
Heceta-6e part and aSC7621 is the Heceta-6p part. They are both in
|
||||
volume production, shipping to Intel and their subs.
|
||||
|
||||
We have enhanced both parts relative to the governing Intel
|
||||
specification. First enhancement is temperature reading resolution. We
|
||||
have used registers below 20h for vendor-specific functions in addition
|
||||
to those in the Intel-specified vendor range.
|
||||
|
||||
Our conversion process produces a result that is reported as two bytes.
|
||||
The fan speed control uses this finer value to produce a "step-less" fan
|
||||
PWM output. These two bytes are "read-locked" to guarantee that once a
|
||||
high or low byte is read, the other byte is locked-in until after the
|
||||
next read of any register. So to get an atomic reading, read high or low
|
||||
byte, then the very next read should be the opposite byte. Our data
|
||||
sheet says 10-bits of resolution, although you may find the lower bits
|
||||
are active, they are not necessarily reliable or useful externally. We
|
||||
chose not to mask them.
|
||||
|
||||
We employ significant filtering that is user tunable as described in the
|
||||
data sheet. Our temperature reports and fan PWM outputs are very smooth
|
||||
when compared to the competition, in addition to the higher resolution
|
||||
temperature reports. The smoother PWM output does not require user
|
||||
intervention.
|
||||
|
||||
We offer GPIO features on the former VID pins. These are open-drain
|
||||
outputs or inputs and may be used as general purpose I/O or as alarm
|
||||
outputs that are based on temperature limits. These are in 19h and 1Ah.
|
||||
|
||||
We offer flexible mapping of temperature readings to thermal zones. Any
|
||||
temperature may be mapped to any zone, which has a default assignment
|
||||
that follows Intel's specs.
|
||||
|
||||
Since there is a fan to zone assignment that allows for the "hotter" of
|
||||
a set of zones to control the PWM of an individual fan, but there is no
|
||||
indication to the user, we have added an indicator that shows which zone
|
||||
is currently controlling the PWM for a given fan. This is in register
|
||||
00h.
|
||||
|
||||
Both remote diode temperature readings may be given an offset value such
|
||||
that the reported reading as well as the temperature used to determine
|
||||
PWM may be offset for system calibration purposes.
|
||||
|
||||
PECI Extended configuration allows for having more than two domains per
|
||||
PECI address and also provides an enabling function for each PECI
|
||||
address. One could use our flexible zone assignment to have a zone
|
||||
assigned to up to 4 PECI addresses. This is not possible in the default
|
||||
Intel configuration. This would be useful in multi-CPU systems with
|
||||
individual fans on each that would benefit from individual fan control.
|
||||
This is in register 0Eh.
|
||||
|
||||
The tachometer measurement system is flexible and able to adapt to many
|
||||
fan types. We can also support pulse-stretched PWM so that 3-wire fans
|
||||
may be used. These characteristics are in registers 04h to 07h.
|
||||
|
||||
Finally, we have added a tach disable function that turns off the tach
|
||||
measurement system for individual tachs in order to save power. That is
|
||||
in register 75h.
|
||||
|
||||
--
|
||||
aSC7621 Product Description
|
||||
|
||||
The aSC7621 has a two wire digital interface compatible with SMBus 2.0.
|
||||
Using a 10-bit ADC, the aSC7621 measures the temperature of two remote diode
|
||||
connected transistors as well as its own die. Support for Platform
|
||||
Environmental Control Interface (PECI) is included.
|
||||
|
||||
Using temperature information from these four zones, an automatic fan speed
|
||||
control algorithm is employed to minimize acoustic impact while achieving
|
||||
recommended CPU temperature under varying operational loads.
|
||||
|
||||
To set fan speed, the aSC7621 has three independent pulse width modulation
|
||||
(PWM) outputs that are controlled by one, or a combination of three,
|
||||
temperature zones. Both high- and low-frequency PWM ranges are supported.
|
||||
|
||||
The aSC7621 also includes a digital filter that can be invoked to smooth
|
||||
temperature readings for better control of fan speed and minimum acoustic
|
||||
impact.
|
||||
|
||||
The aSC7621 has tachometer inputs to measure fan speed on up to four fans.
|
||||
Limit and status registers for all measured values are included to alert
|
||||
the system host that any measurements are outside of programmed limits
|
||||
via status registers.
|
||||
|
||||
System voltages of VCCP, 2.5V, 3.3V, 5.0V, and 12V motherboard power are
|
||||
monitored efficiently with internal scaling resistors.
|
||||
|
||||
Features
|
||||
- Supports PECI interface and monitors internal and remote thermal diodes
|
||||
- 2-wire, SMBus 2.0 compliant, serial interface
|
||||
- 10-bit ADC
|
||||
- Monitors VCCP, 2.5V, 3.3V, 5.0V, and 12V motherboard/processor supplies
|
||||
- Programmable autonomous fan control based on temperature readings
|
||||
- Noise filtering of temperature reading for fan speed control
|
||||
- 0.25C digital temperature sensor resolution
|
||||
- 3 PWM fan speed control outputs for 2-, 3- or 4-wire fans and up to 4 fan
|
||||
tachometer inputs
|
||||
- Enhanced measured temperature to Temperature Zone assignment.
|
||||
- Provides high and low PWM frequency ranges
|
||||
- 3 GPIO pins for custom use
|
||||
- 24-Lead QSOP package
|
||||
|
||||
Configuration Notes
|
||||
===================
|
||||
|
||||
Except where noted below, the sysfs entries created by this driver follow
|
||||
the standards defined in "sysfs-interface".
|
||||
|
||||
temp1_source
|
||||
0 (default) peci_legacy = 0, Remote 1 Temperature
|
||||
peci_legacy = 1, PECI Processor Temperature 0
|
||||
1 Remote 1 Temperature
|
||||
2 Remote 2 Temperature
|
||||
3 Internal Temperature
|
||||
4 PECI Processor Temperature 0
|
||||
5 PECI Processor Temperature 1
|
||||
6 PECI Processor Temperature 2
|
||||
7 PECI Processor Temperature 3
|
||||
|
||||
temp2_source
|
||||
0 (default) Internal Temperature
|
||||
1 Remote 1 Temperature
|
||||
2 Remote 2 Temperature
|
||||
3 Internal Temperature
|
||||
4 PECI Processor Temperature 0
|
||||
5 PECI Processor Temperature 1
|
||||
6 PECI Processor Temperature 2
|
||||
7 PECI Processor Temperature 3
|
||||
|
||||
temp3_source
|
||||
0 (default) Remote 2 Temperature
|
||||
1 Remote 1 Temperature
|
||||
2 Remote 2 Temperature
|
||||
3 Internal Temperature
|
||||
4 PECI Processor Temperature 0
|
||||
5 PECI Processor Temperature 1
|
||||
6 PECI Processor Temperature 2
|
||||
7 PECI Processor Temperature 3
|
||||
|
||||
temp4_source
|
||||
0 (default) peci_legacy = 0, PECI Processor Temperature 0
|
||||
peci_legacy = 1, Remote 1 Temperature
|
||||
1 Remote 1 Temperature
|
||||
2 Remote 2 Temperature
|
||||
3 Internal Temperature
|
||||
4 PECI Processor Temperature 0
|
||||
5 PECI Processor Temperature 1
|
||||
6 PECI Processor Temperature 2
|
||||
7 PECI Processor Temperature 3
|
||||
|
||||
temp[1-4]_smoothing_enable
|
||||
temp[1-4]_smoothing_time
|
||||
Smooths spikes in temp readings caused by noise.
|
||||
Valid values in milliseconds are:
|
||||
35000
|
||||
17600
|
||||
11800
|
||||
7000
|
||||
4400
|
||||
3000
|
||||
1600
|
||||
800
|
||||
|
||||
temp[1-4]_crit
|
||||
When the corresponding zone temperature reaches this value,
|
||||
ALL pwm outputs will go to 100%.
|
||||
|
||||
temp[5-8]_input
|
||||
temp[5-8]_enable
|
||||
The aSC7621 can also read temperatures provided by the processor
|
||||
via the PECI bus. Usually these are "core" temps and are relative
|
||||
to the point where the automatic thermal control circuit starts
|
||||
throttling. This means that these are usually negative numbers.
|
||||
|
||||
pwm[1-3]_enable
|
||||
0 Fan off.
|
||||
1 Fan on manual control.
|
||||
2 Fan on automatic control and will run at the minimum pwm
|
||||
if the temperature for the zone is below the minimum.
|
||||
3 Fan on automatic control but will be off if the temperature
|
||||
for the zone is below the minimum.
|
||||
4-254 Ignored.
|
||||
255 Fan on full.
|
||||
|
||||
pwm[1-3]_auto_channels
|
||||
Bitmap as described in sysfs-interface with the following
|
||||
exceptions...
|
||||
Only the following combination of zones (and their corresponding masks)
|
||||
are valid:
|
||||
1
|
||||
2
|
||||
3
|
||||
2,3
|
||||
1,2,3
|
||||
4
|
||||
1,2,3,4
|
||||
|
||||
Special values:
|
||||
0 Disabled.
|
||||
16 Fan on manual control.
|
||||
31 Fan on full.
|
||||
|
||||
|
||||
pwm[1-3]_invert
|
||||
When set, inverts the meaning of pwm[1-3].
|
||||
i.e. when pwm = 0, the fan will be on full and
|
||||
when pwm = 255 the fan will be off.
|
||||
|
||||
pwm[1-3]_freq
|
||||
PWM frequency in Hz
|
||||
Valid values in Hz are:
|
||||
|
||||
10
|
||||
15
|
||||
23
|
||||
30 (default)
|
||||
38
|
||||
47
|
||||
62
|
||||
94
|
||||
23000
|
||||
24000
|
||||
25000
|
||||
26000
|
||||
27000
|
||||
28000
|
||||
29000
|
||||
30000
|
||||
|
||||
Setting any other value will be ignored.
|
||||
|
||||
peci_enable
|
||||
Enables or disables PECI
|
||||
|
||||
peci_avg
|
||||
Input filter average time.
|
||||
|
||||
0 0 Sec. (no Smoothing) (default)
|
||||
1 0.25 Sec.
|
||||
2 0.5 Sec.
|
||||
3 1.0 Sec.
|
||||
4 2.0 Sec.
|
||||
5 4.0 Sec.
|
||||
6 8.0 Sec.
|
||||
7 0.0 Sec.
|
||||
|
||||
peci_legacy
|
||||
|
||||
0 Standard Mode (default)
|
||||
Remote Diode 1 reading is associated with
|
||||
Temperature Zone 1, PECI is associated with
|
||||
Zone 4
|
||||
|
||||
1 Legacy Mode
|
||||
PECI is associated with Temperature Zone 1,
|
||||
Remote Diode 1 is associated with Zone 4
|
||||
|
||||
peci_diode
|
||||
Diode filter
|
||||
|
||||
0 0.25 Sec.
|
||||
1 1.1 Sec.
|
||||
2 2.4 Sec. (default)
|
||||
3 3.4 Sec.
|
||||
4 5.0 Sec.
|
||||
5 6.8 Sec.
|
||||
6 10.2 Sec.
|
||||
7 16.4 Sec.
|
||||
|
||||
peci_4domain
|
||||
Four domain enable
|
||||
|
||||
0 1 or 2 Domains for enabled processors (default)
|
||||
1 3 or 4 Domains for enabled processors
|
||||
|
||||
peci_domain
|
||||
Domain
|
||||
|
||||
0 Processor contains a single domain (0) (default)
|
||||
1 Processor contains two domains (0,1)
|
@ -5,31 +5,23 @@ Supported chips:
|
||||
* IT8705F
|
||||
Prefix: 'it87'
|
||||
Addresses scanned: from Super I/O config space (8 I/O ports)
|
||||
Datasheet: Publicly available at the ITE website
|
||||
http://www.ite.com.tw/product_info/file/pc/IT8705F_V.0.4.1.pdf
|
||||
Datasheet: Once publicly available at the ITE website, but no longer
|
||||
* IT8712F
|
||||
Prefix: 'it8712'
|
||||
Addresses scanned: from Super I/O config space (8 I/O ports)
|
||||
Datasheet: Publicly available at the ITE website
|
||||
http://www.ite.com.tw/product_info/file/pc/IT8712F_V0.9.1.pdf
|
||||
http://www.ite.com.tw/product_info/file/pc/Errata%20V0.1%20for%20IT8712F%20V0.9.1.pdf
|
||||
http://www.ite.com.tw/product_info/file/pc/IT8712F_V0.9.3.pdf
|
||||
Datasheet: Once publicly available at the ITE website, but no longer
|
||||
* IT8716F/IT8726F
|
||||
Prefix: 'it8716'
|
||||
Addresses scanned: from Super I/O config space (8 I/O ports)
|
||||
Datasheet: Publicly available at the ITE website
|
||||
http://www.ite.com.tw/product_info/file/pc/IT8716F_V0.3.ZIP
|
||||
http://www.ite.com.tw/product_info/file/pc/IT8726F_V0.3.pdf
|
||||
Datasheet: Once publicly available at the ITE website, but no longer
|
||||
* IT8718F
|
||||
Prefix: 'it8718'
|
||||
Addresses scanned: from Super I/O config space (8 I/O ports)
|
||||
Datasheet: Publicly available at the ITE website
|
||||
http://www.ite.com.tw/product_info/file/pc/IT8718F_V0.2.zip
|
||||
http://www.ite.com.tw/product_info/file/pc/IT8718F_V0%203_(for%20C%20version).zip
|
||||
Datasheet: Once publicly available at the ITE website, but no longer
|
||||
* IT8720F
|
||||
Prefix: 'it8720'
|
||||
Addresses scanned: from Super I/O config space (8 I/O ports)
|
||||
Datasheet: Not yet publicly available.
|
||||
Datasheet: Not publicly available
|
||||
* SiS950 [clone of IT8705F]
|
||||
Prefix: 'it87'
|
||||
Addresses scanned: from Super I/O config space (8 I/O ports)
|
||||
@ -136,6 +128,10 @@ registers are read whenever any data is read (unless it is less than 1.5
|
||||
seconds since the last update). This means that you can easily miss
|
||||
once-only alarms.
|
||||
|
||||
Out-of-limit readings can also result in beeping, if the chip is properly
|
||||
wired and configured. Beeping can be enabled or disabled per sensor type
|
||||
(temperatures, voltages and fans.)
|
||||
|
||||
The IT87xx only updates its values each 1.5 seconds; reading it more often
|
||||
will do no harm, but will return 'old' values.
|
||||
|
||||
@ -150,11 +146,38 @@ Fan speed control
|
||||
-----------------
|
||||
|
||||
The fan speed control features are limited to manual PWM mode. Automatic
|
||||
"Smart Guardian" mode control handling is not implemented. However
|
||||
if you want to go for "manual mode" just write 1 to pwmN_enable.
|
||||
"Smart Guardian" mode control handling is only implemented for older chips
|
||||
(see below.) However if you want to go for "manual mode" just write 1 to
|
||||
pwmN_enable.
|
||||
|
||||
If you are only able to control the fan speed with very small PWM values,
|
||||
try lowering the PWM base frequency (pwm1_freq). Depending on the fan,
|
||||
it may give you a somewhat greater control range. The same frequency is
|
||||
used to drive all fan outputs, which is why pwm2_freq and pwm3_freq are
|
||||
read-only.
|
||||
|
||||
|
||||
Automatic fan speed control (old interface)
|
||||
-------------------------------------------
|
||||
|
||||
The driver supports the old interface to automatic fan speed control
|
||||
which is implemented by IT8705F chips up to revision F and IT8712F
|
||||
chips up to revision G.
|
||||
|
||||
This interface implements 4 temperature vs. PWM output trip points.
|
||||
The PWM output of trip point 4 is always the maximum value (fan running
|
||||
at full speed) while the PWM output of the other 3 trip points can be
|
||||
freely chosen. The temperature of all 4 trip points can be freely chosen.
|
||||
Additionally, trip point 1 has a hysteresis temperature attached, to
|
||||
prevent fast switching between fan on and off.
|
||||
|
||||
The chip automatically computes the PWM output value based on the input
|
||||
temperature, based on this simple rule: if the temperature value is
|
||||
between trip point N and trip point N+1 then the PWM output value is
|
||||
the one of trip point N. The automatic control mode is less flexible
|
||||
than the manual control mode, but it reacts faster, is more robust and
|
||||
doesn't use CPU cycles.
|
||||
|
||||
Trip points must be set properly before switching to automatic fan speed
|
||||
control mode. The driver will perform basic integrity checks before
|
||||
actually switching to automatic control mode.
|
||||
|
@ -3,8 +3,8 @@ Kernel driver k10temp
|
||||
|
||||
Supported chips:
|
||||
* AMD Family 10h processors:
|
||||
Socket F: Quad-Core/Six-Core/Embedded Opteron
|
||||
Socket AM2+: Opteron, Phenom (II) X3/X4
|
||||
Socket F: Quad-Core/Six-Core/Embedded Opteron (but see below)
|
||||
Socket AM2+: Quad-Core Opteron, Phenom (II) X3/X4, Athlon X2 (but see below)
|
||||
Socket AM3: Quad-Core Opteron, Athlon/Phenom II X2/X3/X4, Sempron II
|
||||
Socket S1G3: Athlon II, Sempron, Turion II
|
||||
* AMD Family 11h processors:
|
||||
@ -36,10 +36,15 @@ Description
|
||||
This driver permits reading of the internal temperature sensor of AMD
|
||||
Family 10h and 11h processors.
|
||||
|
||||
All these processors have a sensor, but on older revisions of Family 10h
|
||||
processors, the sensor may return inconsistent values (erratum 319). The
|
||||
driver will refuse to load on these revisions unless you specify the
|
||||
"force=1" module parameter.
|
||||
All these processors have a sensor, but on those for Socket F or AM2+,
|
||||
the sensor may return inconsistent values (erratum 319). The driver
|
||||
will refuse to load on these revisions unless you specify the "force=1"
|
||||
module parameter.
|
||||
|
||||
Due to technical reasons, the driver can detect only the mainboard's
|
||||
socket type, not the processor's actual capabilities. Therefore, if you
|
||||
are using an AM3 processor on an AM2+ mainboard, you can safely use the
|
||||
"force=1" parameter.
|
||||
|
||||
There is one temperature measurement value, available as temp1_input in
|
||||
sysfs. It is measured in degrees Celsius with a resolution of 1/8th degree.
|
||||
|
@ -84,6 +84,10 @@ Supported chips:
|
||||
Addresses scanned: I2C 0x4c
|
||||
Datasheet: Publicly available at the Maxim website
|
||||
http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3500
|
||||
* Winbond/Nuvoton W83L771AWG/ASG
|
||||
Prefix: 'w83l771'
|
||||
Addresses scanned: I2C 0x4c
|
||||
Datasheet: Not publicly available, can be requested from Nuvoton
|
||||
|
||||
|
||||
Author: Jean Delvare <khali@linux-fr.org>
|
||||
@ -147,6 +151,12 @@ MAX6680 and MAX6681:
|
||||
* Selectable address
|
||||
* Remote sensor type selection
|
||||
|
||||
W83L771AWG/ASG
|
||||
* The AWG and ASG variants only differ in package format.
|
||||
* Filter and alert configuration register at 0xBF
|
||||
* Diode ideality factor configuration (remote sensor) at 0xE3
|
||||
* Moving average (depending on conversion rate)
|
||||
|
||||
All temperature values are given in degrees Celsius. Resolution
|
||||
is 1.0 degree for the local temperature, 0.125 degree for the remote
|
||||
temperature, except for the MAX6657, MAX6658 and MAX6659 which have a
|
||||
@ -163,6 +173,18 @@ The lm90 driver will not update its values more frequently than every
|
||||
other second; reading them more often will do no harm, but will return
|
||||
'old' values.
|
||||
|
||||
SMBus Alert Support
|
||||
-------------------
|
||||
|
||||
This driver has basic support for SMBus alert. When an alert is received,
|
||||
the status register is read and the faulty temperature channel is logged.
|
||||
|
||||
The Analog Devices chips (ADM1032 and ADT7461) do not implement the SMBus
|
||||
alert protocol properly so additional care is needed: the ALERT output is
|
||||
disabled when an alert is received, and is re-enabled only when the alarm
|
||||
is gone. Otherwise the chip would block alerts from other chips in the bus
|
||||
as long as the alarm is active.
|
||||
|
||||
PEC Support
|
||||
-----------
|
||||
|
||||
|
@ -15,7 +15,8 @@ Supported adapters:
|
||||
* Intel 82801I (ICH9)
|
||||
* Intel EP80579 (Tolapai)
|
||||
* Intel 82801JI (ICH10)
|
||||
* Intel PCH
|
||||
* Intel 3400/5 Series (PCH)
|
||||
* Intel Cougar Point (PCH)
|
||||
Datasheets: Publicly available at the Intel website
|
||||
|
||||
Authors:
|
||||
|
@ -29,6 +29,9 @@ can be easily added when needed.
|
||||
Earlier kernels defaulted to type=0 (Philips). But now, if the type
|
||||
parameter is missing, the driver will simply fail to initialize.
|
||||
|
||||
SMBus alert support is available on adapters which have this line properly
|
||||
connected to the parallel port's interrupt pin.
|
||||
|
||||
|
||||
Building your own adapter
|
||||
-------------------------
|
||||
|
@ -9,3 +9,14 @@ parport handling is not an option. The drawback is a reduced portability
|
||||
and the impossibility to daisy-chain other parallel port devices.
|
||||
|
||||
Please see i2c-parport for documentation.
|
||||
|
||||
Module parameters:
|
||||
|
||||
* type: type of adapter (see i2c-parport or modinfo)
|
||||
|
||||
* base: base I/O address
|
||||
Default is 0x378 which is fairly common for parallel ports, at least on PC.
|
||||
|
||||
* irq: optional IRQ
|
||||
This must be passed if you want SMBus alert support, assuming your adapter
|
||||
actually supports this.
|
||||
|
@ -185,6 +185,22 @@ the protocol. All ARP communications use slave address 0x61 and
|
||||
require PEC checksums.
|
||||
|
||||
|
||||
SMBus Alert
|
||||
===========
|
||||
|
||||
SMBus Alert was introduced in Revision 1.0 of the specification.
|
||||
|
||||
The SMBus alert protocol allows several SMBus slave devices to share a
|
||||
single interrupt pin on the SMBus master, while still allowing the master
|
||||
to know which slave triggered the interrupt.
|
||||
|
||||
This is implemented the following way in the Linux kernel:
|
||||
* I2C bus drivers which support SMBus alert should call
|
||||
i2c_setup_smbus_alert() to setup SMBus alert support.
|
||||
* I2C drivers for devices which can trigger SMBus alerts should implement
|
||||
the optional alert() callback.
|
||||
|
||||
|
||||
I2C Block Transactions
|
||||
======================
|
||||
|
||||
|
@ -318,8 +318,9 @@ Plain I2C communication
|
||||
These routines read and write some bytes from/to a client. The client
|
||||
contains the i2c address, so you do not have to include it. The second
|
||||
parameter contains the bytes to read/write, the third the number of bytes
|
||||
to read/write (must be less than the length of the buffer.) Returned is
|
||||
the actual number of bytes read/written.
|
||||
to read/write (must be less than the length of the buffer, also should be
|
||||
less than 64k since msg.len is u16.) Returned is the actual number of bytes
|
||||
read/written.
|
||||
|
||||
int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msg,
|
||||
int num);
|
||||
|
49
Documentation/init.txt
Normal file
49
Documentation/init.txt
Normal file
@ -0,0 +1,49 @@
|
||||
Explaining the dreaded "No init found." boot hang message
|
||||
=========================================================
|
||||
|
||||
OK, so you've got this pretty unintuitive message (currently located
|
||||
in init/main.c) and are wondering what the H*** went wrong.
|
||||
Some high-level reasons for failure (listed roughly in order of execution)
|
||||
to load the init binary are:
|
||||
A) Unable to mount root FS
|
||||
B) init binary doesn't exist on rootfs
|
||||
C) broken console device
|
||||
D) binary exists but dependencies not available
|
||||
E) binary cannot be loaded
|
||||
|
||||
Detailed explanations:
|
||||
0) Set "debug" kernel parameter (in bootloader config file or CONFIG_CMDLINE)
|
||||
to get more detailed kernel messages.
|
||||
A) make sure you have the correct root FS type
|
||||
(and root= kernel parameter points to the correct partition),
|
||||
required drivers such as storage hardware (such as SCSI or USB!)
|
||||
and filesystem (ext3, jffs2 etc.) are builtin (alternatively as modules,
|
||||
to be pre-loaded by an initrd)
|
||||
C) Possibly a conflict in console= setup --> initial console unavailable.
|
||||
E.g. some serial consoles are unreliable due to serial IRQ issues (e.g.
|
||||
missing interrupt-based configuration).
|
||||
Try using a different console= device or e.g. netconsole= .
|
||||
D) e.g. required library dependencies of the init binary such as
|
||||
/lib/ld-linux.so.2 missing or broken. Use readelf -d <INIT>|grep NEEDED
|
||||
to find out which libraries are required.
|
||||
E) make sure the binary's architecture matches your hardware.
|
||||
E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM hardware.
|
||||
In case you tried loading a non-binary file here (shell script?),
|
||||
you should make sure that the script specifies an interpreter in its shebang
|
||||
header line (#!/...) that is fully working (including its library
|
||||
dependencies). And before tackling scripts, better first test a simple
|
||||
non-script binary such as /bin/sh and confirm its successful execution.
|
||||
To find out more, add code to init/main.c to display kernel_execve()'s
|
||||
return values.
|
||||
|
||||
Please extend this explanation whenever you find new failure causes
|
||||
(after all loading the init binary is a CRITICAL and hard transition step
|
||||
which needs to be made as painless as possible), then submit a patch to LKML.
|
||||
Further TODOs:
|
||||
- Implement the various run_init_process() invocations via a struct array
|
||||
which can then store the kernel_execve() result value and on failure
|
||||
log it all by iterating over _all_ results (very important usability fix).
|
||||
- try to make the implementation itself more helpful in general,
|
||||
e.g. by providing additional error messages at affected places.
|
||||
|
||||
Andreas Mohr <andi at lisas period de>
|
@ -27,12 +27,30 @@ set of events/packets.
|
||||
|
||||
A set of ABS_MT events with the desired properties is defined. The events
|
||||
are divided into categories, to allow for partial implementation. The
|
||||
minimum set consists of ABS_MT_TOUCH_MAJOR, ABS_MT_POSITION_X and
|
||||
ABS_MT_POSITION_Y, which allows for multiple fingers to be tracked. If the
|
||||
device supports it, the ABS_MT_WIDTH_MAJOR may be used to provide the size
|
||||
of the approaching finger. Anisotropy and direction may be specified with
|
||||
ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MINOR and ABS_MT_ORIENTATION. The
|
||||
ABS_MT_TOOL_TYPE may be used to specify whether the touching tool is a
|
||||
minimum set consists of ABS_MT_POSITION_X and ABS_MT_POSITION_Y, which
|
||||
allows for multiple fingers to be tracked. If the device supports it, the
|
||||
ABS_MT_TOUCH_MAJOR and ABS_MT_WIDTH_MAJOR may be used to provide the size
|
||||
of the contact area and approaching finger, respectively.
|
||||
|
||||
The TOUCH and WIDTH parameters have a geometrical interpretation; imagine
|
||||
looking through a window at someone gently holding a finger against the
|
||||
glass. You will see two regions, one inner region consisting of the part
|
||||
of the finger actually touching the glass, and one outer region formed by
|
||||
the perimeter of the finger. The diameter of the inner region is the
|
||||
ABS_MT_TOUCH_MAJOR, the diameter of the outer region is
|
||||
ABS_MT_WIDTH_MAJOR. Now imagine the person pressing the finger harder
|
||||
against the glass. The inner region will increase, and in general, the
|
||||
ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR, which is always smaller than
|
||||
unity, is related to the finger pressure. For pressure-based devices,
|
||||
ABS_MT_PRESSURE may be used to provide the pressure on the contact area
|
||||
instead.
|
||||
|
||||
In addition to the MAJOR parameters, the oval shape of the finger can be
|
||||
described by adding the MINOR parameters, such that MAJOR and MINOR are the
|
||||
major and minor axes of an ellipse. Finally, the orientation of the oval
|
||||
shape can be described with the ORIENTATION parameter.
|
||||
|
||||
The ABS_MT_TOOL_TYPE may be used to specify whether the touching tool is a
|
||||
finger or a pen or something else. Devices with more granular information
|
||||
may specify general shapes as blobs, i.e., as a sequence of rectangular
|
||||
shapes grouped together by an ABS_MT_BLOB_ID. Finally, for the few devices
|
||||
@ -42,16 +60,30 @@ report finger tracking from hardware [5].
|
||||
Here is what a minimal event sequence for a two-finger touch would look
|
||||
like:
|
||||
|
||||
ABS_MT_TOUCH_MAJOR
|
||||
ABS_MT_POSITION_X
|
||||
ABS_MT_POSITION_Y
|
||||
SYN_MT_REPORT
|
||||
ABS_MT_TOUCH_MAJOR
|
||||
ABS_MT_POSITION_X
|
||||
ABS_MT_POSITION_Y
|
||||
SYN_MT_REPORT
|
||||
SYN_REPORT
|
||||
|
||||
Here is the sequence after lifting one of the fingers:
|
||||
|
||||
ABS_MT_POSITION_X
|
||||
ABS_MT_POSITION_Y
|
||||
SYN_MT_REPORT
|
||||
SYN_REPORT
|
||||
|
||||
And here is the sequence after lifting the remaining finger:
|
||||
|
||||
SYN_MT_REPORT
|
||||
SYN_REPORT
|
||||
|
||||
If the driver reports one of BTN_TOUCH or ABS_PRESSURE in addition to the
|
||||
ABS_MT events, the last SYN_MT_REPORT event may be omitted. Otherwise, the
|
||||
last SYN_REPORT will be dropped by the input core, resulting in no
|
||||
zero-finger event reaching userland.
|
||||
|
||||
Event Semantics
|
||||
---------------
|
||||
@ -87,6 +119,12 @@ the contact. The ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR approximates
|
||||
the notion of pressure. The fingers of the hand and the palm all have
|
||||
different characteristic widths [1].
|
||||
|
||||
ABS_MT_PRESSURE
|
||||
|
||||
The pressure, in arbitrary units, on the contact area. May be used instead
|
||||
of TOUCH and WIDTH for pressure-based devices or any device with a spatial
|
||||
signal intensity distribution.
|
||||
|
||||
ABS_MT_ORIENTATION
|
||||
|
||||
The orientation of the ellipse. The value should describe a signed quarter
|
||||
@ -170,6 +208,16 @@ There are a few devices that support trackingID in hardware. User space can
|
||||
make use of these native identifiers to reduce bandwidth and cpu usage.
|
||||
|
||||
|
||||
Gestures
|
||||
--------
|
||||
|
||||
In the specific application of creating gesture events, the TOUCH and WIDTH
|
||||
parameters can be used to, e.g., approximate finger pressure or distinguish
|
||||
between index finger and thumb. With the addition of the MINOR parameters,
|
||||
one can also distinguish between a sweeping finger and a pointing finger,
|
||||
and with ORIENTATION, one can detect twisting of fingers.
|
||||
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
@ -185,11 +233,6 @@ where examples can be found.
|
||||
difference between the contact position and the approaching tool position
|
||||
could be used to derive tilt.
|
||||
[2] The list can of course be extended.
|
||||
[3] The multi-touch X driver is currently in the prototyping stage. At the
|
||||
time of writing (April 2009), the MT protocol is not yet merged, and the
|
||||
prototype implements finger matching, basic mouse support and two-finger
|
||||
scrolling. The project aims at improving the quality of current multi-touch
|
||||
functionality available in the Synaptics X driver, and in addition
|
||||
implement more advanced gestures.
|
||||
[3] Multitouch X driver project: http://bitmath.org/code/multitouch/.
|
||||
[4] See the section on event computation.
|
||||
[5] See the section on finger tracking.
|
||||
|
@ -75,7 +75,7 @@ and the number of steps or will clamp at the maximum and zero depending on
|
||||
the configuration.
|
||||
|
||||
Because GPIO to IRQ mapping is platform specific, this information must
|
||||
be given in seperately to the driver. See the example below.
|
||||
be given separately to the driver. See the example below.
|
||||
|
||||
---------<snip>---------
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
Copyright (C) 2002-2008 Sentelic Corporation.
|
||||
Last update: Oct-31-2008
|
||||
Copyright (C) 2002-2010 Sentelic Corporation.
|
||||
Last update: Jan-13-2010
|
||||
|
||||
==============================================================================
|
||||
* Finger Sensing Pad Intellimouse Mode(scrolling wheel, 4th and 5th buttons)
|
||||
@ -44,7 +44,7 @@ B) MSID 6: Horizontal and Vertical scrolling.
|
||||
Packet 1
|
||||
Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
|
||||
BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
|
||||
1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|l|r|u|d|
|
||||
1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|r|l|u|d|
|
||||
|---------------| |---------------| |---------------| |---------------|
|
||||
|
||||
Byte 1: Bit7 => Y overflow
|
||||
@ -59,15 +59,15 @@ Byte 2: X Movement(9-bit 2's complement integers)
|
||||
Byte 3: Y Movement(9-bit 2's complement integers)
|
||||
Byte 4: Bit0 => the Vertical scrolling movement downward.
|
||||
Bit1 => the Vertical scrolling movement upward.
|
||||
Bit2 => the Vertical scrolling movement rightward.
|
||||
Bit3 => the Vertical scrolling movement leftward.
|
||||
Bit2 => the Horizontal scrolling movement leftward.
|
||||
Bit3 => the Horizontal scrolling movement rightward.
|
||||
Bit4 => 1 = 4th mouse button is pressed, Forward one page.
|
||||
0 = 4th mouse button is not pressed.
|
||||
Bit5 => 1 = 5th mouse button is pressed, Backward one page.
|
||||
0 = 5th mouse button is not pressed.
|
||||
|
||||
C) MSID 7:
|
||||
# FSP uses 2 packets(8 Bytes) data to represent Absolute Position
|
||||
# FSP uses 2 packets (8 Bytes) to represent Absolute Position.
|
||||
so we have PACKET NUMBER to identify packets.
|
||||
If PACKET NUMBER is 0, the packet is Packet 1.
|
||||
If PACKET NUMBER is 1, the packet is Packet 2.
|
||||
@ -129,7 +129,7 @@ Byte 3: Message Type => 0x00 (Disabled)
|
||||
Byte 4: Bit7~Bit0 => Don't Care
|
||||
|
||||
==============================================================================
|
||||
* Absolute position for STL3888-A0.
|
||||
* Absolute position for STL3888-Ax.
|
||||
==============================================================================
|
||||
Packet 1 (ABSOLUTE POSITION)
|
||||
Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
|
||||
@ -179,14 +179,14 @@ Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0])
|
||||
Bit5~Bit4 => y2_g
|
||||
Bit7~Bit6 => x2_g
|
||||
|
||||
Notify Packet for STL3888-A0
|
||||
Notify Packet for STL3888-Ax
|
||||
Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
|
||||
BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
|
||||
1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|d|u|0|0|0|0|
|
||||
|---------------| |---------------| |---------------| |---------------|
|
||||
|
||||
Byte 1: Bit7~Bit6 => 00, Normal data packet
|
||||
=> 01, Absolute coordination packet
|
||||
=> 01, Absolute coordinates packet
|
||||
=> 10, Notify packet
|
||||
Bit5 => 1
|
||||
Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1):
|
||||
@ -205,7 +205,7 @@ Byte 4: Bit7 => scroll right button
|
||||
Bit6 => scroll left button
|
||||
Bit5 => scroll down button
|
||||
Bit4 => scroll up button
|
||||
* Note that if gesture and additional button (Bit4~Bit7)
|
||||
* Note that if gesture and additional button (Bit4~Bit7)
|
||||
happen at the same time, the button information will not
|
||||
be sent.
|
||||
Bit3~Bit0 => Reserved
|
||||
@ -213,7 +213,98 @@ Byte 4: Bit7 => scroll right button
|
||||
Sample sequence of Multi-finger, Multi-coordinate mode:
|
||||
|
||||
notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1,
|
||||
abs pkt 2, ..., notify packet(valid bit == 0)
|
||||
abs pkt 2, ..., notify packet (valid bit == 0)
|
||||
|
||||
==============================================================================
|
||||
* Absolute position for STL3888-B0.
|
||||
==============================================================================
|
||||
Packet 1 (ABSOLUTE POSITION)
|
||||
Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
|
||||
BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
|
||||
1 |0|1|V|F|1|0|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|u|d|X|X|Y|Y|
|
||||
|---------------| |---------------| |---------------| |---------------|
|
||||
|
||||
Byte 1: Bit7~Bit6 => 00, Normal data packet
|
||||
=> 01, Absolute coordinates packet
|
||||
=> 10, Notify packet
|
||||
Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up.
|
||||
When both fingers are up, the last two reports have zero valid
|
||||
bit.
|
||||
Bit4 => finger up/down information. 1: finger down, 0: finger up.
|
||||
Bit3 => 1
|
||||
Bit2 => finger index, 0 is the first finger, 1 is the second finger.
|
||||
Bit1 => Right Button, 1 is pressed, 0 is not pressed.
|
||||
Bit0 => Left Button, 1 is pressed, 0 is not pressed.
|
||||
Byte 2: X coordinate (xpos[9:2])
|
||||
Byte 3: Y coordinate (ypos[9:2])
|
||||
Byte 4: Bit1~Bit0 => Y coordinate (ypos[1:0])
|
||||
Bit3~Bit2 => X coordinate (xpos[1:0])
|
||||
Bit4 => scroll down button
|
||||
Bit5 => scroll up button
|
||||
Bit6 => scroll left button
|
||||
Bit7 => scroll right button
|
||||
|
||||
Packet 2 (ABSOLUTE POSITION)
|
||||
Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
|
||||
BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
|
||||
1 |0|1|V|F|1|1|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|u|d|X|X|Y|Y|
|
||||
|---------------| |---------------| |---------------| |---------------|
|
||||
|
||||
Byte 1: Bit7~Bit6 => 00, Normal data packet
|
||||
=> 01, Absolute coordinates packet
|
||||
=> 10, Notify packet
|
||||
Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up.
|
||||
When both fingers are up, the last two reports have zero valid
|
||||
bit.
|
||||
Bit4 => finger up/down information. 1: finger down, 0: finger up.
|
||||
Bit3 => 1
|
||||
Bit2 => finger index, 0 is the first finger, 1 is the second finger.
|
||||
Bit1 => Right Button, 1 is pressed, 0 is not pressed.
|
||||
Bit0 => Left Button, 1 is pressed, 0 is not pressed.
|
||||
Byte 2: X coordinate (xpos[9:2])
|
||||
Byte 3: Y coordinate (ypos[9:2])
|
||||
Byte 4: Bit1~Bit0 => Y coordinate (ypos[1:0])
|
||||
Bit3~Bit2 => X coordinate (xpos[1:0])
|
||||
Bit4 => scroll down button
|
||||
Bit5 => scroll up button
|
||||
Bit6 => scroll left button
|
||||
Bit7 => scroll right button
|
||||
|
||||
Notify Packet for STL3888-B0
|
||||
Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
|
||||
BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
|
||||
1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|u|d|0|0|0|0|
|
||||
|---------------| |---------------| |---------------| |---------------|
|
||||
|
||||
Byte 1: Bit7~Bit6 => 00, Normal data packet
|
||||
=> 01, Absolute coordinates packet
|
||||
=> 10, Notify packet
|
||||
Bit5 => 1
|
||||
Bit4 => when in absolute coordinate mode (valid when EN_PKT_GO is 1):
|
||||
0: left button is generated by the on-pad command
|
||||
1: left button is generated by the external button
|
||||
Bit3 => 1
|
||||
Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
|
||||
Bit1 => Right Button, 1 is pressed, 0 is not pressed.
|
||||
Bit0 => Left Button, 1 is pressed, 0 is not pressed.
|
||||
Byte 2: Message Type => 0xB7 (Multi Finger, Multi Coordinate mode)
|
||||
Byte 3: Bit7~Bit6 => Don't care
|
||||
Bit5~Bit4 => Number of fingers
|
||||
Bit3~Bit1 => Reserved
|
||||
Bit0 => 1: enter gesture mode; 0: leaving gesture mode
|
||||
Byte 4: Bit7 => scroll right button
|
||||
Bit6 => scroll left button
|
||||
Bit5 => scroll up button
|
||||
Bit4 => scroll down button
|
||||
* Note that if gesture and additional button (Bit4~Bit7)
|
||||
happen at the same time, the button information will not
|
||||
be sent.
|
||||
Bit3~Bit0 => Reserved
|
||||
|
||||
Sample sequence of Multi-finger, Multi-coordinate mode:
|
||||
|
||||
notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1,
|
||||
abs pkt 2, ..., notify packet (valid bit == 0)
|
||||
|
||||
==============================================================================
|
||||
* FSP Enable/Disable packet
|
||||
@ -409,7 +500,8 @@ offset width default r/w name
|
||||
0: read only, 1: read/write enable
|
||||
(Note that the following registers do not require clock gating being
|
||||
enabled prior to write: 05 06 07 08 09 0c 0f 10 11 12 16 17 18 23 2e
|
||||
40 41 42 43.)
|
||||
40 41 42 43. In addition to that, this bit must be 1 when gesture
|
||||
mode is enabled)
|
||||
|
||||
0x31 RW on-pad command detection
|
||||
bit7 0 RW on-pad command left button down tag
|
||||
@ -463,6 +555,10 @@ offset width default r/w name
|
||||
absolute coordinates; otherwise, host only receives packets with
|
||||
relative coordinate.)
|
||||
|
||||
bit7 0 RW EN_PS2_F2: PS/2 gesture mode 2nd
|
||||
finger packet enable
|
||||
0: disable, 1: enable
|
||||
|
||||
0x43 RW on-pad control
|
||||
bit0 0 RW on-pad control enable
|
||||
0: disable, 1: enable
|
||||
|
@ -56,10 +56,11 @@ Following this convention is good because:
|
||||
(5) When following the convention, the driver code can use generic
|
||||
code to copy the parameters between user and kernel space.
|
||||
|
||||
This table lists ioctls visible from user land for Linux/i386. It contains
|
||||
most drivers up to 2.3.14, but I know I am missing some.
|
||||
This table lists ioctls visible from user land for Linux/x86. It contains
|
||||
most drivers up to 2.6.31, but I know I am missing some. There has been
|
||||
no attempt to list non-X86 architectures or ioctls from drivers/staging/.
|
||||
|
||||
Code Seq# Include File Comments
|
||||
Code Seq#(hex) Include File Comments
|
||||
========================================================
|
||||
0x00 00-1F linux/fs.h conflict!
|
||||
0x00 00-1F scsi/scsi_ioctl.h conflict!
|
||||
@ -69,119 +70,228 @@ Code Seq# Include File Comments
|
||||
0x03 all linux/hdreg.h
|
||||
0x04 D2-DC linux/umsdos_fs.h Dead since 2.6.11, but don't reuse these.
|
||||
0x06 all linux/lp.h
|
||||
0x09 all linux/md.h
|
||||
0x09 all linux/raid/md_u.h
|
||||
0x10 00-0F drivers/char/s390/vmcp.h
|
||||
0x12 all linux/fs.h
|
||||
linux/blkpg.h
|
||||
0x1b all InfiniBand Subsystem <http://www.openib.org/>
|
||||
0x20 all drivers/cdrom/cm206.h
|
||||
0x22 all scsi/sg.h
|
||||
'#' 00-3F IEEE 1394 Subsystem Block for the entire subsystem
|
||||
'$' 00-0F linux/perf_counter.h, linux/perf_event.h
|
||||
'1' 00-1F <linux/timepps.h> PPS kit from Ulrich Windl
|
||||
<ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/>
|
||||
'2' 01-04 linux/i2o.h
|
||||
'3' 00-0F drivers/s390/char/raw3270.h conflict!
|
||||
'3' 00-1F linux/suspend_ioctls.h conflict!
|
||||
and kernel/power/user.c
|
||||
'8' all SNP8023 advanced NIC card
|
||||
<mailto:mcr@solidum.com>
|
||||
'A' 00-1F linux/apm_bios.h
|
||||
'@' 00-0F linux/radeonfb.h conflict!
|
||||
'@' 00-0F drivers/video/aty/aty128fb.c conflict!
|
||||
'A' 00-1F linux/apm_bios.h conflict!
|
||||
'A' 00-0F linux/agpgart.h conflict!
|
||||
and drivers/char/agp/compat_ioctl.h
|
||||
'A' 00-7F sound/asound.h conflict!
|
||||
'B' 00-1F linux/cciss_ioctl.h conflict!
|
||||
'B' 00-0F include/linux/pmu.h conflict!
|
||||
'B' C0-FF advanced bbus
|
||||
<mailto:maassen@uni-freiburg.de>
|
||||
'C' all linux/soundcard.h
|
||||
'C' all linux/soundcard.h conflict!
|
||||
'C' 01-2F linux/capi.h conflict!
|
||||
'C' F0-FF drivers/net/wan/cosa.h conflict!
|
||||
'D' all arch/s390/include/asm/dasd.h
|
||||
'E' all linux/input.h
|
||||
'F' all linux/fb.h
|
||||
'H' all linux/hiddev.h
|
||||
'I' all linux/isdn.h
|
||||
'D' 40-5F drivers/scsi/dpt/dtpi_ioctl.h
|
||||
'D' 05 drivers/scsi/pmcraid.h
|
||||
'E' all linux/input.h conflict!
|
||||
'E' 00-0F xen/evtchn.h conflict!
|
||||
'F' all linux/fb.h conflict!
|
||||
'F' 01-02 drivers/scsi/pmcraid.h conflict!
|
||||
'F' 20 drivers/video/fsl-diu-fb.h conflict!
|
||||
'F' 20 drivers/video/intelfb/intelfb.h conflict!
|
||||
'F' 20 linux/ivtvfb.h conflict!
|
||||
'F' 20 linux/matroxfb.h conflict!
|
||||
'F' 20 drivers/video/aty/atyfb_base.c conflict!
|
||||
'F' 00-0F video/da8xx-fb.h conflict!
|
||||
'F' 80-8F linux/arcfb.h conflict!
|
||||
'F' DD video/sstfb.h conflict!
|
||||
'G' 00-3F drivers/misc/sgi-gru/grulib.h conflict!
|
||||
'G' 00-0F linux/gigaset_dev.h conflict!
|
||||
'H' 00-7F linux/hiddev.h conflict!
|
||||
'H' 00-0F linux/hidraw.h conflict!
|
||||
'H' 00-0F sound/asound.h conflict!
|
||||
'H' 20-40 sound/asound_fm.h conflict!
|
||||
'H' 80-8F sound/sfnt_info.h conflict!
|
||||
'H' 10-8F sound/emu10k1.h conflict!
|
||||
'H' 10-1F sound/sb16_csp.h conflict!
|
||||
'H' 10-1F sound/hda_hwdep.h conflict!
|
||||
'H' 40-4F sound/hdspm.h conflict!
|
||||
'H' 40-4F sound/hdsp.h conflict!
|
||||
'H' 90 sound/usb/usx2y/usb_stream.h
|
||||
'H' C0-F0 net/bluetooth/hci.h conflict!
|
||||
'H' C0-DF net/bluetooth/hidp/hidp.h conflict!
|
||||
'H' C0-DF net/bluetooth/cmtp/cmtp.h conflict!
|
||||
'H' C0-DF net/bluetooth/bnep/bnep.h conflict!
|
||||
'I' all linux/isdn.h conflict!
|
||||
'I' 00-0F drivers/isdn/divert/isdn_divert.h conflict!
|
||||
'I' 40-4F linux/mISDNif.h conflict!
|
||||
'J' 00-1F drivers/scsi/gdth_ioctl.h
|
||||
'K' all linux/kd.h
|
||||
'L' 00-1F linux/loop.h
|
||||
'L' 20-2F driver/usb/misc/vstusb.h
|
||||
'L' 00-1F linux/loop.h conflict!
|
||||
'L' 10-1F drivers/scsi/mpt2sas/mpt2sas_ctl.h conflict!
|
||||
'L' E0-FF linux/ppdd.h encrypted disk device driver
|
||||
<http://linux01.gwdg.de/~alatham/ppdd.html>
|
||||
'M' all linux/soundcard.h
|
||||
'M' all linux/soundcard.h conflict!
|
||||
'M' 01-16 mtd/mtd-abi.h conflict!
|
||||
and drivers/mtd/mtdchar.c
|
||||
'M' 01-03 drivers/scsi/megaraid/megaraid_sas.h
|
||||
'M' 00-0F drivers/video/fsl-diu-fb.h conflict!
|
||||
'N' 00-1F drivers/usb/scanner.h
|
||||
'O' 00-02 include/mtd/ubi-user.h UBI
|
||||
'P' all linux/soundcard.h
|
||||
'O' 00-06 mtd/ubi-user.h UBI
|
||||
'P' all linux/soundcard.h conflict!
|
||||
'P' 60-6F sound/sscape_ioctl.h conflict!
|
||||
'P' 00-0F drivers/usb/class/usblp.c conflict!
|
||||
'Q' all linux/soundcard.h
|
||||
'R' 00-1F linux/random.h
|
||||
'R' 00-1F linux/random.h conflict!
|
||||
'R' 01 linux/rfkill.h conflict!
|
||||
'R' 01-0F media/rds.h conflict!
|
||||
'R' C0-DF net/bluetooth/rfcomm.h
|
||||
'S' all linux/cdrom.h conflict!
|
||||
'S' 80-81 scsi/scsi_ioctl.h conflict!
|
||||
'S' 82-FF scsi/scsi.h conflict!
|
||||
'S' 00-7F sound/asequencer.h conflict!
|
||||
'T' all linux/soundcard.h conflict!
|
||||
'T' 00-AF sound/asound.h conflict!
|
||||
'T' all arch/x86/include/asm/ioctls.h conflict!
|
||||
'U' 00-EF linux/drivers/usb/usb.h
|
||||
'V' all linux/vt.h
|
||||
'T' C0-DF linux/if_tun.h conflict!
|
||||
'U' all sound/asound.h conflict!
|
||||
'U' 00-0F drivers/media/video/uvc/uvcvideo.h conflict!
|
||||
'U' 00-CF linux/uinput.h conflict!
|
||||
'U' 00-EF linux/usbdevice_fs.h
|
||||
'U' C0-CF drivers/bluetooth/hci_uart.h
|
||||
'V' all linux/vt.h conflict!
|
||||
'V' all linux/videodev2.h conflict!
|
||||
'V' C0 linux/ivtvfb.h conflict!
|
||||
'V' C0 linux/ivtv.h conflict!
|
||||
'V' C0 media/davinci/vpfe_capture.h conflict!
|
||||
'V' C0 media/si4713.h conflict!
|
||||
'V' C0-CF drivers/media/video/mxb.h conflict!
|
||||
'W' 00-1F linux/watchdog.h conflict!
|
||||
'W' 00-1F linux/wanrouter.h conflict!
|
||||
'X' all linux/xfs_fs.h
|
||||
'W' 00-3F sound/asound.h conflict!
|
||||
'X' all fs/xfs/xfs_fs.h conflict!
|
||||
and fs/xfs/linux-2.6/xfs_ioctl32.h
|
||||
and include/linux/falloc.h
|
||||
and linux/fs.h
|
||||
'X' all fs/ocfs2/ocfs_fs.h conflict!
|
||||
'X' 01 linux/pktcdvd.h conflict!
|
||||
'Y' all linux/cyclades.h
|
||||
'[' 00-07 linux/usb/usbtmc.h USB Test and Measurement Devices
|
||||
'Z' 14-15 drivers/message/fusion/mptctl.h
|
||||
'[' 00-07 linux/usb/tmc.h USB Test and Measurement Devices
|
||||
<mailto:gregkh@suse.de>
|
||||
'a' all ATM on linux
|
||||
'a' all linux/atm*.h, linux/sonet.h ATM on linux
|
||||
<http://lrcwww.epfl.ch/linux-atm/magic.html>
|
||||
'b' 00-FF bit3 vme host bridge
|
||||
'b' 00-FF conflict! bit3 vme host bridge
|
||||
<mailto:natalia@nikhefk.nikhef.nl>
|
||||
'b' 00-0F media/bt819.h conflict!
|
||||
'c' all linux/cm4000_cs.h conflict!
|
||||
'c' 00-7F linux/comstats.h conflict!
|
||||
'c' 00-7F linux/coda.h conflict!
|
||||
'c' 80-9F arch/s390/include/asm/chsc.h
|
||||
'c' A0-AF arch/x86/include/asm/msr.h
|
||||
'c' 00-1F linux/chio.h conflict!
|
||||
'c' 80-9F arch/s390/include/asm/chsc.h conflict!
|
||||
'c' A0-AF arch/x86/include/asm/msr.h conflict!
|
||||
'd' 00-FF linux/char/drm/drm.h conflict!
|
||||
'd' 02-40 pcmcia/ds.h conflict!
|
||||
'd' 10-3F drivers/media/video/dabusb.h conflict!
|
||||
'd' C0-CF drivers/media/video/saa7191.h conflict!
|
||||
'd' F0-FF linux/digi1.h
|
||||
'e' all linux/digi1.h conflict!
|
||||
'e' 00-1F net/irda/irtty.h conflict!
|
||||
'f' 00-1F linux/ext2_fs.h
|
||||
'h' 00-7F Charon filesystem
|
||||
'e' 00-1F drivers/net/irda/irtty-sir.h conflict!
|
||||
'f' 00-1F linux/ext2_fs.h conflict!
|
||||
'f' 00-1F linux/ext3_fs.h conflict!
|
||||
'f' 00-0F fs/jfs/jfs_dinode.h conflict!
|
||||
'f' 00-0F fs/ext4/ext4.h conflict!
|
||||
'f' 00-0F linux/fs.h conflict!
|
||||
'f' 00-0F fs/ocfs2/ocfs2_fs.h conflict!
|
||||
'g' 00-0F linux/usb/gadgetfs.h
|
||||
'g' 20-2F linux/usb/g_printer.h
|
||||
'h' 00-7F conflict! Charon filesystem
|
||||
<mailto:zapman@interlan.net>
|
||||
'i' 00-3F linux/i2o.h
|
||||
'h' 00-1F linux/hpet.h conflict!
|
||||
'i' 00-3F linux/i2o-dev.h conflict!
|
||||
'i' 0B-1F linux/ipmi.h conflict!
|
||||
'i' 80-8F linux/i8k.h
|
||||
'j' 00-3F linux/joystick.h
|
||||
'k' 00-0F linux/spi/spidev.h conflict!
|
||||
'k' 00-05 video/kyro.h conflict!
|
||||
'l' 00-3F linux/tcfs_fs.h transparent cryptographic file system
|
||||
<http://mikonos.dia.unisa.it/tcfs>
|
||||
'l' 40-7F linux/udf_fs_i.h in development:
|
||||
<http://sourceforge.net/projects/linux-udf/>
|
||||
'm' 00-09 linux/mmtimer.h
|
||||
'm' 00-09 linux/mmtimer.h conflict!
|
||||
'm' all linux/mtio.h conflict!
|
||||
'm' all linux/soundcard.h conflict!
|
||||
'm' all linux/synclink.h conflict!
|
||||
'm' 00-19 drivers/message/fusion/mptctl.h conflict!
|
||||
'm' 00 drivers/scsi/megaraid/megaraid_ioctl.h conflict!
|
||||
'm' 00-1F net/irda/irmod.h conflict!
|
||||
'n' 00-7F linux/ncp_fs.h
|
||||
'n' 00-7F linux/ncp_fs.h and fs/ncpfs/ioctl.c
|
||||
'n' 80-8F linux/nilfs2_fs.h NILFS2
|
||||
'n' E0-FF video/matrox.h matroxfb
|
||||
'n' E0-FF linux/matroxfb.h matroxfb
|
||||
'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2
|
||||
'o' 00-03 include/mtd/ubi-user.h conflict! (OCFS2 and UBI overlaps)
|
||||
'o' 40-41 include/mtd/ubi-user.h UBI
|
||||
'o' 01-A1 include/linux/dvb/*.h DVB
|
||||
'o' 00-03 mtd/ubi-user.h conflict! (OCFS2 and UBI overlap)
|
||||
'o' 40-41 mtd/ubi-user.h UBI
|
||||
'o' 01-A1 linux/dvb/*.h DVB
|
||||
'p' 00-0F linux/phantom.h conflict! (OpenHaptics needs this)
|
||||
'p' 00-1F linux/rtc.h conflict!
|
||||
'p' 00-3F linux/mc146818rtc.h conflict!
|
||||
'p' 40-7F linux/nvram.h
|
||||
'p' 80-9F user-space parport
|
||||
'p' 80-9F linux/ppdev.h user-space parport
|
||||
<mailto:tim@cyberelk.net>
|
||||
'p' a1-a4 linux/pps.h LinuxPPS
|
||||
'p' A1-A4 linux/pps.h LinuxPPS
|
||||
<mailto:giometti@linux.it>
|
||||
'q' 00-1F linux/serio.h
|
||||
'q' 80-FF Internet PhoneJACK, Internet LineJACK
|
||||
<http://www.quicknet.net>
|
||||
'r' 00-1F linux/msdos_fs.h
|
||||
'q' 80-FF linux/telephony.h Internet PhoneJACK, Internet LineJACK
|
||||
linux/ixjuser.h <http://www.quicknet.net>
|
||||
'r' 00-1F linux/msdos_fs.h and fs/fat/dir.c
|
||||
's' all linux/cdk.h
|
||||
't' 00-7F linux/if_ppp.h
|
||||
't' 80-8F linux/isdn_ppp.h
|
||||
't' 90 linux/toshiba.h
|
||||
'u' 00-1F linux/smb_fs.h
|
||||
'v' 00-1F linux/ext2_fs.h conflict!
|
||||
'v' all linux/videodev.h conflict!
|
||||
'v' 00-1F linux/ext2_fs.h conflict!
|
||||
'v' 00-1F linux/fs.h conflict!
|
||||
'v' 00-0F linux/sonypi.h conflict!
|
||||
'v' C0-CF drivers/media/video/ov511.h conflict!
|
||||
'v' C0-DF media/pwc-ioctl.h conflict!
|
||||
'v' C0-FF linux/meye.h conflict!
|
||||
'v' C0-CF drivers/media/video/zoran/zoran.h conflict!
|
||||
'v' D0-DF drivers/media/video/cpia2/cpia2dev.h conflict!
|
||||
'w' all CERN SCI driver
|
||||
'y' 00-1F packet based user level communications
|
||||
<mailto:zapman@interlan.net>
|
||||
'z' 00-3F CAN bus card
|
||||
'z' 00-3F CAN bus card conflict!
|
||||
<mailto:hdstich@connectu.ulm.circular.de>
|
||||
'z' 40-7F CAN bus card
|
||||
'z' 40-7F CAN bus card conflict!
|
||||
<mailto:oe@port.de>
|
||||
'z' 10-4F drivers/s390/crypto/zcrypt_api.h conflict!
|
||||
0x80 00-1F linux/fb.h
|
||||
0x81 00-1F linux/videotext.h
|
||||
0x88 00-3F media/ovcamchip.h
|
||||
0x89 00-06 arch/x86/include/asm/sockios.h
|
||||
0x89 0B-DF linux/sockios.h
|
||||
0x89 E0-EF linux/sockios.h SIOCPROTOPRIVATE range
|
||||
0x89 E0-EF linux/dn.h PROTOPRIVATE range
|
||||
0x89 F0-FF linux/sockios.h SIOCDEVPRIVATE range
|
||||
0x8B all linux/wireless.h
|
||||
0x8C 00-3F WiNRADiO driver
|
||||
<http://www.proximity.com.au/~brian/winradio/>
|
||||
0x90 00 drivers/cdrom/sbpcd.h
|
||||
0x92 00-0F drivers/usb/mon/mon_bin.c
|
||||
0x93 60-7F linux/auto_fs.h
|
||||
0x94 all fs/btrfs/ioctl.h
|
||||
0x97 00-7F fs/ceph/ioctl.h Ceph file system
|
||||
0x99 00-0F 537-Addinboard driver
|
||||
<mailto:buk@buks.ipn.de>
|
||||
0xA0 all linux/sdp/sdp.h Industrial Device Project
|
||||
@ -198,11 +308,16 @@ Code Seq# Include File Comments
|
||||
0xB0 all RATIO devices in development:
|
||||
<mailto:vgo@ratio.de>
|
||||
0xB1 00-1F PPPoX <mailto:mostrows@styx.uwaterloo.ca>
|
||||
0xC0 00-0F linux/usb/iowarrior.h
|
||||
0xCB 00-1F CBM serial IEC bus in development:
|
||||
<mailto:michael.klein@puffin.lb.shuttle.de>
|
||||
0xCD 01 linux/reiserfs_fs.h
|
||||
0xCF 02 fs/cifs/ioctl.c
|
||||
0xDB 00-0F drivers/char/mwave/mwavepub.h
|
||||
0xDD 00-3F ZFCP device driver see drivers/s390/scsi/
|
||||
<mailto:aherrman@de.ibm.com>
|
||||
0xF3 00-3F video/sisfb.h sisfb (in development)
|
||||
0xF3 00-3F drivers/usb/misc/sisusbvga/sisusb.h sisfb (in development)
|
||||
<mailto:thomas@winischhofer.net>
|
||||
0xF4 00-1F video/mbxfb.h mbxfb
|
||||
<mailto:raph@8d.com>
|
||||
0xFD all linux/dm-ioctl.h
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user