mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-09 15:29:16 +00:00
sync to Linus v4.13-rc2 for subsystem developers to work against
This commit is contained in:
commit
53a2ebaaab
@ -24,8 +24,6 @@ DMA-ISA-LPC.txt
|
||||
- How to do DMA with ISA (and LPC) devices.
|
||||
DMA-attributes.txt
|
||||
- listing of the various possible attributes a DMA region can have
|
||||
DocBook/
|
||||
- directory with DocBook templates etc. for kernel documentation.
|
||||
EDID/
|
||||
- directory with info on customizing EDID for broken gfx/displays.
|
||||
IPMI.txt
|
||||
@ -40,8 +38,6 @@ Intel-IOMMU.txt
|
||||
- basic info on the Intel IOMMU virtualization support.
|
||||
Makefile
|
||||
- It's not of interest for those who aren't touching the build system.
|
||||
Makefile.sphinx
|
||||
- It's not of interest for those who aren't touching the build system.
|
||||
PCI/
|
||||
- info related to PCI drivers.
|
||||
RCU/
|
||||
@ -246,8 +242,6 @@ kprobes.txt
|
||||
- documents the kernel probes debugging feature.
|
||||
kref.txt
|
||||
- docs on adding reference counters (krefs) to kernel objects.
|
||||
kselftest.txt
|
||||
- small unittests for (some) individual codepaths in the kernel.
|
||||
laptops/
|
||||
- directory with laptop related info and laptop driver documentation.
|
||||
ldm.txt
|
||||
@ -264,6 +258,8 @@ logo.gif
|
||||
- full colour GIF image of Linux logo (penguin - Tux).
|
||||
logo.txt
|
||||
- info on creator of above logo & site to get additional images from.
|
||||
lsm.txt
|
||||
- Linux Security Modules: General Security Hooks for Linux
|
||||
lzo.txt
|
||||
- kernel LZO decompressor input formats
|
||||
m68k/
|
||||
|
@ -55,14 +55,6 @@ Description:
|
||||
Indicates the maximum USB speed supported by this port.
|
||||
Users:
|
||||
|
||||
What: /sys/class/udc/<udc>/maximum_speed
|
||||
Date: June 2011
|
||||
KernelVersion: 3.1
|
||||
Contact: Felipe Balbi <balbi@kernel.org>
|
||||
Description:
|
||||
Indicates the maximum USB speed supported by this port.
|
||||
Users:
|
||||
|
||||
What: /sys/class/udc/<udc>/soft_connect
|
||||
Date: June 2011
|
||||
KernelVersion: 3.1
|
||||
@ -91,3 +83,11 @@ Description:
|
||||
'configured', and 'suspended'; however not all USB Device
|
||||
Controllers support reporting all states.
|
||||
Users:
|
||||
|
||||
What: /sys/class/udc/<udc>/function
|
||||
Date: June 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: Felipe Balbi <balbi@kernel.org>
|
||||
Description:
|
||||
Prints out name of currently running USB Gadget Driver.
|
||||
Users:
|
||||
|
15
Documentation/ABI/stable/sysfs-driver-aspeed-vuart
Normal file
15
Documentation/ABI/stable/sysfs-driver-aspeed-vuart
Normal file
@ -0,0 +1,15 @@
|
||||
What: /sys/bus/platform/drivers/aspeed-vuart/*/lpc_address
|
||||
Date: April 2017
|
||||
Contact: Jeremy Kerr <jk@ozlabs.org>
|
||||
Description: Configures which IO port the host side of the UART
|
||||
will appear on the host <-> BMC LPC bus.
|
||||
Users: OpenBMC. Proposed changes should be mailed to
|
||||
openbmc@lists.ozlabs.org
|
||||
|
||||
What: /sys/bus/platform/drivers/aspeed-vuart*/sirq
|
||||
Date: April 2017
|
||||
Contact: Jeremy Kerr <jk@ozlabs.org>
|
||||
Description: Configures which interrupt number the host side of
|
||||
the UART will appear on the host <-> BMC LPC bus.
|
||||
Users: OpenBMC. Proposed changes should be mailed to
|
||||
openbmc@lists.ozlabs.org
|
119
Documentation/ABI/stable/sysfs-hypervisor-xen
Normal file
119
Documentation/ABI/stable/sysfs-hypervisor-xen
Normal file
@ -0,0 +1,119 @@
|
||||
What: /sys/hypervisor/compilation/compile_date
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Contains the build time stamp of the Xen hypervisor
|
||||
Might return "<denied>" in case of special security settings
|
||||
in the hypervisor.
|
||||
|
||||
What: /sys/hypervisor/compilation/compiled_by
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Contains information who built the Xen hypervisor
|
||||
Might return "<denied>" in case of special security settings
|
||||
in the hypervisor.
|
||||
|
||||
What: /sys/hypervisor/compilation/compiler
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Compiler which was used to build the Xen hypervisor
|
||||
Might return "<denied>" in case of special security settings
|
||||
in the hypervisor.
|
||||
|
||||
What: /sys/hypervisor/properties/capabilities
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Space separated list of supported guest system types. Each type
|
||||
is in the format: <class>-<major>.<minor>-<arch>
|
||||
With:
|
||||
<class>: "xen" -- x86: paravirtualized, arm: standard
|
||||
"hvm" -- x86 only: fully virtualized
|
||||
<major>: major guest interface version
|
||||
<minor>: minor guest interface version
|
||||
<arch>: architecture, e.g.:
|
||||
"x86_32": 32 bit x86 guest without PAE
|
||||
"x86_32p": 32 bit x86 guest with PAE
|
||||
"x86_64": 64 bit x86 guest
|
||||
"armv7l": 32 bit arm guest
|
||||
"aarch64": 64 bit arm guest
|
||||
|
||||
What: /sys/hypervisor/properties/changeset
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Changeset of the hypervisor (git commit)
|
||||
Might return "<denied>" in case of special security settings
|
||||
in the hypervisor.
|
||||
|
||||
What: /sys/hypervisor/properties/features
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Features the Xen hypervisor supports for the guest as defined
|
||||
in include/xen/interface/features.h printed as a hex value.
|
||||
|
||||
What: /sys/hypervisor/properties/pagesize
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Default page size of the hypervisor printed as a hex value.
|
||||
Might return "0" in case of special security settings
|
||||
in the hypervisor.
|
||||
|
||||
What: /sys/hypervisor/properties/virtual_start
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Virtual address of the hypervisor as a hex value.
|
||||
|
||||
What: /sys/hypervisor/type
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Type of hypervisor:
|
||||
"xen": Xen hypervisor
|
||||
|
||||
What: /sys/hypervisor/uuid
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
UUID of the guest as known to the Xen hypervisor.
|
||||
|
||||
What: /sys/hypervisor/version/extra
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
The Xen version is in the format <major>.<minor><extra>
|
||||
This is the <extra> part of it.
|
||||
Might return "<denied>" in case of special security settings
|
||||
in the hypervisor.
|
||||
|
||||
What: /sys/hypervisor/version/major
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
The Xen version is in the format <major>.<minor><extra>
|
||||
This is the <major> part of it.
|
||||
|
||||
What: /sys/hypervisor/version/minor
|
||||
Date: March 2009
|
||||
KernelVersion: 2.6.30
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
The Xen version is in the format <major>.<minor><extra>
|
||||
This is the <minor> part of it.
|
@ -1,12 +1,14 @@
|
||||
What: /config/usb-gadget/gadget/functions/uac1.name
|
||||
Date: Sep 2014
|
||||
KernelVersion: 3.18
|
||||
Date: June 2017
|
||||
KernelVersion: 4.14
|
||||
Description:
|
||||
The attributes:
|
||||
|
||||
audio_buf_size - audio buffer size
|
||||
fn_cap - capture pcm device file name
|
||||
fn_cntl - control device file name
|
||||
fn_play - playback pcm device file name
|
||||
req_buf_size - ISO OUT endpoint request buffer size
|
||||
req_count - ISO OUT endpoint request count
|
||||
c_chmask - capture channel mask
|
||||
c_srate - capture sampling rate
|
||||
c_ssize - capture sample size (bytes)
|
||||
p_chmask - playback channel mask
|
||||
p_srate - playback sampling rate
|
||||
p_ssize - playback sample size (bytes)
|
||||
req_number - the number of pre-allocated request
|
||||
for both capture and playback
|
||||
|
12
Documentation/ABI/testing/configfs-usb-gadget-uac1_legacy
Normal file
12
Documentation/ABI/testing/configfs-usb-gadget-uac1_legacy
Normal file
@ -0,0 +1,12 @@
|
||||
What: /config/usb-gadget/gadget/functions/uac1_legacy.name
|
||||
Date: Sep 2014
|
||||
KernelVersion: 3.18
|
||||
Description:
|
||||
The attributes:
|
||||
|
||||
audio_buf_size - audio buffer size
|
||||
fn_cap - capture pcm device file name
|
||||
fn_cntl - control device file name
|
||||
fn_play - playback pcm device file name
|
||||
req_buf_size - ISO OUT endpoint request buffer size
|
||||
req_count - ISO OUT endpoint request count
|
38
Documentation/ABI/testing/sysfs-bus-fsi
Normal file
38
Documentation/ABI/testing/sysfs-bus-fsi
Normal file
@ -0,0 +1,38 @@
|
||||
What: /sys/bus/platform/devices/fsi-master/rescan
|
||||
Date: May 2017
|
||||
KernelVersion: 4.12
|
||||
Contact: cbostic@linux.vnet.ibm.com
|
||||
Description:
|
||||
Initiates a FSI master scan for all connected slave devices
|
||||
on its links.
|
||||
|
||||
What: /sys/bus/platform/devices/fsi-master/break
|
||||
Date: May 2017
|
||||
KernelVersion: 4.12
|
||||
Contact: cbostic@linux.vnet.ibm.com
|
||||
Description:
|
||||
Sends an FSI BREAK command on a master's communication
|
||||
link to any connnected slaves. A BREAK resets connected
|
||||
device's logic and preps it to receive further commands
|
||||
from the master.
|
||||
|
||||
What: /sys/bus/platform/devices/fsi-master/slave@00:00/term
|
||||
Date: May 2017
|
||||
KernelVersion: 4.12
|
||||
Contact: cbostic@linux.vnet.ibm.com
|
||||
Description:
|
||||
Sends an FSI terminate command from the master to its
|
||||
connected slave. A terminate resets the slave's state machines
|
||||
that control access to the internally connected engines. In
|
||||
addition the slave freezes its internal error register for
|
||||
debugging purposes. This command is also needed to abort any
|
||||
ongoing operation in case of an expired 'Master Time Out'
|
||||
timer.
|
||||
|
||||
What: /sys/bus/platform/devices/fsi-master/slave@00:00/raw
|
||||
Date: May 2017
|
||||
KernelVersion: 4.12
|
||||
Contact: cbostic@linux.vnet.ibm.com
|
||||
Description:
|
||||
Provides a means of reading/writing a 32 bit value from/to a
|
||||
specified FSI bus address.
|
@ -1425,6 +1425,17 @@ Description:
|
||||
guarantees that the hardware fifo is flushed to the device
|
||||
buffer.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:device*/buffer/hwfifo_timeout
|
||||
KernelVersion: 4.12
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
A read/write property to provide capability to delay reporting of
|
||||
samples till a timeout is reached. This allows host processors to
|
||||
sleep, while the sensor is storing samples in its internal fifo.
|
||||
The maximum timeout in seconds can be specified by setting
|
||||
hwfifo_timeout.The current delay can be read by reading
|
||||
hwfifo_timeout. A value of 0 means that there is no timeout.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/buffer/hwfifo_watermark
|
||||
KernelVersion: 4.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
|
@ -5,4 +5,3 @@ Description:
|
||||
Reading returns either '1' or '0'. '1' means that the
|
||||
battery level supplied to sensor is below 2.25V.
|
||||
This ABI is available for tsys02d, htu21, ms8607
|
||||
This ABI is available for htu21, ms8607
|
||||
|
@ -16,6 +16,54 @@ Description:
|
||||
- "OC2REF" : OC2REF signal is used as trigger output.
|
||||
- "OC3REF" : OC3REF signal is used as trigger output.
|
||||
- "OC4REF" : OC4REF signal is used as trigger output.
|
||||
Additional modes (on TRGO2 only):
|
||||
- "OC5REF" : OC5REF signal is used as trigger output.
|
||||
- "OC6REF" : OC6REF signal is used as trigger output.
|
||||
- "compare_pulse_OC4REF":
|
||||
OC4REF rising or falling edges generate pulses.
|
||||
- "compare_pulse_OC6REF":
|
||||
OC6REF rising or falling edges generate pulses.
|
||||
- "compare_pulse_OC4REF_r_or_OC6REF_r":
|
||||
OC4REF or OC6REF rising edges generate pulses.
|
||||
- "compare_pulse_OC4REF_r_or_OC6REF_f":
|
||||
OC4REF rising or OC6REF falling edges generate pulses.
|
||||
- "compare_pulse_OC5REF_r_or_OC6REF_r":
|
||||
OC5REF or OC6REF rising edges generate pulses.
|
||||
- "compare_pulse_OC5REF_r_or_OC6REF_f":
|
||||
OC5REF rising or OC6REF falling edges generate pulses.
|
||||
|
||||
+-----------+ +-------------+ +---------+
|
||||
| Prescaler +-> | Counter | +-> | Master | TRGO(2)
|
||||
+-----------+ +--+--------+-+ |-> | Control +-->
|
||||
| | || +---------+
|
||||
+--v--------+-+ OCxREF || +---------+
|
||||
| Chx compare +----------> | Output | ChX
|
||||
+-----------+-+ | | Control +-->
|
||||
. | | +---------+
|
||||
. | | .
|
||||
+-----------v-+ OC6REF | .
|
||||
| Ch6 compare +---------+>
|
||||
+-------------+
|
||||
|
||||
Example with: "compare_pulse_OC4REF_r_or_OC6REF_r":
|
||||
|
||||
X
|
||||
X X
|
||||
X . . X
|
||||
X . . X
|
||||
X . . X
|
||||
count X . . . . X
|
||||
. . . .
|
||||
. . . .
|
||||
+---------------+
|
||||
OC4REF | . . |
|
||||
+-+ . . +-+
|
||||
. +---+ .
|
||||
OC6REF . | | .
|
||||
+-------+ +-------+
|
||||
+-+ +-+
|
||||
TRGO2 | | | |
|
||||
+-+ +---+ +---------+
|
||||
|
||||
What: /sys/bus/iio/devices/triggerX/master_mode
|
||||
KernelVersion: 4.11
|
||||
@ -90,3 +138,18 @@ Description:
|
||||
Counting is enabled on rising edge of the connected
|
||||
trigger, and remains enabled for the duration of this
|
||||
selected mode.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_count_trigger_mode_available
|
||||
KernelVersion: 4.13
|
||||
Contact: benjamin.gaignard@st.com
|
||||
Description:
|
||||
Reading returns the list possible trigger modes.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_count0_trigger_mode
|
||||
KernelVersion: 4.13
|
||||
Contact: benjamin.gaignard@st.com
|
||||
Description:
|
||||
Configure the device counter trigger mode
|
||||
counting direction is set by in_count0_count_direction
|
||||
attribute and the counter is clocked by the connected trigger
|
||||
rising edges.
|
||||
|
110
Documentation/ABI/testing/sysfs-bus-thunderbolt
Normal file
110
Documentation/ABI/testing/sysfs-bus-thunderbolt
Normal file
@ -0,0 +1,110 @@
|
||||
What: /sys/bus/thunderbolt/devices/.../domainX/security
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: This attribute holds current Thunderbolt security level
|
||||
set by the system BIOS. Possible values are:
|
||||
|
||||
none: All devices are automatically authorized
|
||||
user: Devices are only authorized based on writing
|
||||
appropriate value to the authorized attribute
|
||||
secure: Require devices that support secure connect at
|
||||
minimum. User needs to authorize each device.
|
||||
dponly: Automatically tunnel Display port (and USB). No
|
||||
PCIe tunnels are created.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../authorized
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: This attribute is used to authorize Thunderbolt devices
|
||||
after they have been connected. If the device is not
|
||||
authorized, no devices such as PCIe and Display port are
|
||||
available to the system.
|
||||
|
||||
Contents of this attribute will be 0 when the device is not
|
||||
yet authorized.
|
||||
|
||||
Possible values are supported:
|
||||
1: The device will be authorized and connected
|
||||
|
||||
When key attribute contains 32 byte hex string the possible
|
||||
values are:
|
||||
1: The 32 byte hex string is added to the device NVM and
|
||||
the device is authorized.
|
||||
2: Send a challenge based on the 32 byte hex string. If the
|
||||
challenge response from device is valid, the device is
|
||||
authorized. In case of failure errno will be ENOKEY if
|
||||
the device did not contain a key at all, and
|
||||
EKEYREJECTED if the challenge response did not match.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../key
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: When a devices supports Thunderbolt secure connect it will
|
||||
have this attribute. Writing 32 byte hex string changes
|
||||
authorization to use the secure connection method instead.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../device
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: This attribute contains id of this device extracted from
|
||||
the device DROM.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../device_name
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: This attribute contains name of this device extracted from
|
||||
the device DROM.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../vendor
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: This attribute contains vendor id of this device extracted
|
||||
from the device DROM.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../vendor_name
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: This attribute contains vendor name of this device extracted
|
||||
from the device DROM.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../unique_id
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: This attribute contains unique_id string of this device.
|
||||
This is either read from hardware registers (UUID on
|
||||
newer hardware) or based on UID from the device DROM.
|
||||
Can be used to uniquely identify particular device.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../nvm_version
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: If the device has upgradeable firmware the version
|
||||
number is available here. Format: %x.%x, major.minor.
|
||||
If the device is in safe mode reading the file returns
|
||||
-ENODATA instead as the NVM version is not available.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../nvm_authenticate
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: When new NVM image is written to the non-active NVM
|
||||
area (through non_activeX NVMem device), the
|
||||
authentication procedure is started by writing 1 to
|
||||
this file. If everything goes well, the device is
|
||||
restarted with the new NVM firmware. If the image
|
||||
verification fails an error code is returned instead.
|
||||
|
||||
When read holds status of the last authentication
|
||||
operation if an error occurred during the process. This
|
||||
is directly the status value from the DMA configuration
|
||||
based mailbox before the device is power cycled. Writing
|
||||
0 here clears the status.
|
@ -229,6 +229,6 @@ KernelVersion: 4.1
|
||||
Contact: linux-mtd@lists.infradead.org
|
||||
Description:
|
||||
For a partition, the offset of that partition from the start
|
||||
of the master device in bytes. This attribute is absent on
|
||||
main devices, so it can be used to distinguish between
|
||||
partitions and devices that aren't partitions.
|
||||
of the parent (another partition or a flash device) in bytes.
|
||||
This attribute is absent on flash devices, so it can be used
|
||||
to distinguish them from partitions.
|
||||
|
16
Documentation/ABI/testing/sysfs-class-mux
Normal file
16
Documentation/ABI/testing/sysfs-class-mux
Normal file
@ -0,0 +1,16 @@
|
||||
What: /sys/class/mux/
|
||||
Date: April 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: Peter Rosin <peda@axentia.se>
|
||||
Description:
|
||||
The mux/ class sub-directory belongs to the Generic MUX
|
||||
Framework and provides a sysfs interface for using MUX
|
||||
controllers.
|
||||
|
||||
What: /sys/class/mux/muxchipN/
|
||||
Date: April 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: Peter Rosin <peda@axentia.se>
|
||||
Description:
|
||||
A /sys/class/mux/muxchipN directory is created for each
|
||||
probed MUX chip where N is a simple enumeration.
|
@ -251,3 +251,11 @@ Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Indicates the unique physical switch identifier of a switch this
|
||||
port belongs to, as a string.
|
||||
|
||||
What: /sys/class/net/<iface>/phydev
|
||||
Date: May 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Symbolic link to the PHY device this network device is attached
|
||||
to.
|
||||
|
36
Documentation/ABI/testing/sysfs-class-net-phydev
Normal file
36
Documentation/ABI/testing/sysfs-class-net-phydev
Normal file
@ -0,0 +1,36 @@
|
||||
What: /sys/class/mdio_bus/<bus>/<device>/attached_dev
|
||||
Date: May 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Symbolic link to the network device this PHY device is
|
||||
attached to.
|
||||
|
||||
What: /sys/class/mdio_bus/<bus>/<device>/phy_has_fixups
|
||||
Date: February 2014
|
||||
KernelVersion: 3.15
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Boolean value indicating whether the PHY device has
|
||||
any fixups registered against it (phy_register_fixup)
|
||||
|
||||
What: /sys/class/mdio_bus/<bus>/<device>/phy_id
|
||||
Date: November 2012
|
||||
KernelVersion: 3.8
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
32-bit hexadecimal value corresponding to the PHY device's OUI,
|
||||
model and revision number.
|
||||
|
||||
What: /sys/class/mdio_bus/<bus>/<device>/phy_interface
|
||||
Date: February 2014
|
||||
KernelVersion: 3.15
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
String value indicating the PHY interface, possible
|
||||
values are:.
|
||||
<empty> (not available), mii, gmii, sgmii, tbi, rev-mii,
|
||||
rmii, rgmii, rgmii-id, rgmii-rxid, rgmii-txid, rtbi, smii
|
||||
xgmii, moca, qsgmii, trgmii, 1000base-x, 2500base-x, rxaui,
|
||||
xaui, 10gbase-kr, unknown
|
||||
|
@ -1,20 +1,3 @@
|
||||
What: /sys/class/power_supply/twl4030_ac/max_current
|
||||
/sys/class/power_supply/twl4030_usb/max_current
|
||||
Description:
|
||||
Read/Write limit on current which may
|
||||
be drawn from the ac (Accessory Charger) or
|
||||
USB port.
|
||||
|
||||
Value is in micro-Amps.
|
||||
|
||||
Value is set automatically to an appropriate
|
||||
value when a cable is plugged or unplugged.
|
||||
|
||||
Value can the set by writing to the attribute.
|
||||
The change will only persist until the next
|
||||
plug event. These event are reported via udev.
|
||||
|
||||
|
||||
What: /sys/class/power_supply/twl4030_usb/mode
|
||||
Description:
|
||||
Changing mode for USB port.
|
||||
|
@ -30,6 +30,21 @@ Description:
|
||||
|
||||
Valid values: source, sink
|
||||
|
||||
What: /sys/class/typec/<port>/port_type
|
||||
Date: May 2017
|
||||
Contact: Badhri Jagan Sridharan <Badhri@google.com>
|
||||
Description:
|
||||
Indicates the type of the port. This attribute can be used for
|
||||
requesting a change in the port type. Port type change is
|
||||
supported as a synchronous operation, so write(2) to the
|
||||
attribute will not return until the operation has finished.
|
||||
|
||||
Valid values:
|
||||
- source (The port will behave as source only DFP port)
|
||||
- sink (The port will behave as sink only UFP port)
|
||||
- dual (The port will behave as dual-role-data and
|
||||
dual-role-power port)
|
||||
|
||||
What: /sys/class/typec/<port>/vconn_source
|
||||
Date: April 2017
|
||||
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||
|
@ -1,6 +1,6 @@
|
||||
What: /sys/firmware/devicetree/*
|
||||
Date: November 2013
|
||||
Contact: Grant Likely <grant.likely@linaro.org>
|
||||
Contact: Grant Likely <grant.likely@arm.com>, devicetree@vger.kernel.org
|
||||
Description:
|
||||
When using OpenFirmware or a Flattened Device Tree to enumerate
|
||||
hardware, the device tree structure will be exposed in this
|
||||
@ -26,3 +26,27 @@ Description:
|
||||
name plus address). Properties are represented as files
|
||||
in the directory. The contents of each file is the exact
|
||||
binary data from the device tree.
|
||||
|
||||
What: /sys/firmware/fdt
|
||||
Date: February 2015
|
||||
KernelVersion: 3.19
|
||||
Contact: Frank Rowand <frowand.list@gmail.com>, devicetree@vger.kernel.org
|
||||
Description:
|
||||
Exports the FDT blob that was passed to the kernel by
|
||||
the bootloader. This allows userland applications such
|
||||
as kexec to access the raw binary. This blob is also
|
||||
useful when debugging since it contains any changes
|
||||
made to the blob by the bootloader.
|
||||
|
||||
The fact that this node does not reside under
|
||||
/sys/firmware/device-tree is deliberate: FDT is also used
|
||||
on arm64 UEFI/ACPI systems to communicate just the UEFI
|
||||
and ACPI entry points, but the FDT is never unflattened
|
||||
and used to configure the system.
|
||||
|
||||
A CRC32 checksum is calculated over the entire FDT
|
||||
blob, and verified at late_initcall time. The sysfs
|
||||
entry is instantiated only if the checksum is valid,
|
||||
i.e., if the FDT blob has not been modified in the mean
|
||||
time. Otherwise, a warning is printed.
|
||||
Users: kexec, debugging
|
||||
|
@ -75,7 +75,7 @@ Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the memory footprint used by f2fs.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/trim_sections
|
||||
What: /sys/fs/f2fs/<disk>/batched_trim_sections
|
||||
Date: February 2015
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
@ -112,3 +112,21 @@ Date: January 2016
|
||||
Contact: "Shuoran Liu" <liushuoran@huawei.com>
|
||||
Description:
|
||||
Shows total written kbytes issued to disk.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/inject_rate
|
||||
Date: May 2016
|
||||
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
||||
Description:
|
||||
Controls the injection rate.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/inject_type
|
||||
Date: May 2016
|
||||
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
||||
Description:
|
||||
Controls the injection type.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/reserved_blocks
|
||||
Date: June 2017
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls current reserved blocks in system.
|
||||
|
@ -1,8 +1,19 @@
|
||||
What: /sys/hypervisor/guest_type
|
||||
Date: June 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Type of guest:
|
||||
"Xen": standard guest type on arm
|
||||
"HVM": fully virtualized guest (x86)
|
||||
"PV": paravirtualized guest (x86)
|
||||
"PVH": fully virtualized guest without legacy emulation (x86)
|
||||
|
||||
What: /sys/hypervisor/pmu/pmu_mode
|
||||
Date: August 2015
|
||||
KernelVersion: 4.3
|
||||
Contact: Boris Ostrovsky <boris.ostrovsky@oracle.com>
|
||||
Description:
|
||||
Description: If running under Xen:
|
||||
Describes mode that Xen's performance-monitoring unit (PMU)
|
||||
uses. Accepted values are
|
||||
"off" -- PMU is disabled
|
||||
@ -17,7 +28,16 @@ What: /sys/hypervisor/pmu/pmu_features
|
||||
Date: August 2015
|
||||
KernelVersion: 4.3
|
||||
Contact: Boris Ostrovsky <boris.ostrovsky@oracle.com>
|
||||
Description:
|
||||
Description: If running under Xen:
|
||||
Describes Xen PMU features (as an integer). A set bit indicates
|
||||
that the corresponding feature is enabled. See
|
||||
include/xen/interface/xenpmu.h for available features
|
||||
|
||||
What: /sys/hypervisor/properties/buildid
|
||||
Date: June 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
Build id of the hypervisor, needed for hypervisor live patching.
|
||||
Might return "<denied>" in case of special security settings
|
||||
in the hypervisor.
|
@ -17,3 +17,11 @@ Description:
|
||||
* 2 -> Dust Cleaning
|
||||
* 4 -> Efficient Thermal Dissipation Mode
|
||||
|
||||
What: /sys/devices/platform/ideapad/touchpad
|
||||
Date: May 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: "Ritesh Raj Sarraf <rrs@debian.org>"
|
||||
Description:
|
||||
Control touchpad mode.
|
||||
* 1 -> Switched On
|
||||
* 0 -> Switched Off
|
||||
|
47
Documentation/ABI/testing/sysfs-uevent
Normal file
47
Documentation/ABI/testing/sysfs-uevent
Normal file
@ -0,0 +1,47 @@
|
||||
What: /sys/.../uevent
|
||||
Date: May 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||
Description:
|
||||
Enable passing additional variables for synthetic uevents that
|
||||
are generated by writing /sys/.../uevent file.
|
||||
|
||||
Recognized extended format is ACTION [UUID [KEY=VALUE ...].
|
||||
|
||||
The ACTION is compulsory - it is the name of the uevent action
|
||||
("add", "change", "remove"). There is no change compared to
|
||||
previous functionality here. The rest of the extended format
|
||||
is optional.
|
||||
|
||||
You need to pass UUID first before any KEY=VALUE pairs.
|
||||
The UUID must be in "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
format where 'x' is a hex digit. The UUID is considered to be
|
||||
a transaction identifier so it's possible to use the same UUID
|
||||
value for one or more synthetic uevents in which case we
|
||||
logically group these uevents together for any userspace
|
||||
listeners. The UUID value appears in uevent as
|
||||
"SYNTH_UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" environment
|
||||
variable.
|
||||
|
||||
If UUID is not passed in, the generated synthetic uevent gains
|
||||
"SYNTH_UUID=0" environment variable automatically.
|
||||
|
||||
The KEY=VALUE pairs can contain alphanumeric characters only.
|
||||
It's possible to define zero or more pairs - each pair is then
|
||||
delimited by a space character ' '. Each pair appears in
|
||||
synthetic uevent as "SYNTH_ARG_KEY=VALUE". That means the KEY
|
||||
name gains "SYNTH_ARG_" prefix to avoid possible collisions
|
||||
with existing variables.
|
||||
|
||||
Example of valid sequence written to the uevent file:
|
||||
|
||||
add fe4d7c9d-b8c6-4a70-9ef1-3d8a58d18eed A=1 B=abc
|
||||
|
||||
This generates synthetic uevent including these variables:
|
||||
|
||||
ACTION=add
|
||||
SYNTH_ARG_A=1
|
||||
SYNTH_ARG_B=abc
|
||||
SYNTH_UUID=fe4d7c9d-b8c6-4a70-9ef1-3d8a58d18eed
|
||||
Users:
|
||||
udev, userspace tools generating synthetic uevents
|
@ -1,22 +1,24 @@
|
||||
Dynamic DMA mapping Guide
|
||||
=========================
|
||||
=========================
|
||||
Dynamic DMA mapping Guide
|
||||
=========================
|
||||
|
||||
David S. Miller <davem@redhat.com>
|
||||
Richard Henderson <rth@cygnus.com>
|
||||
Jakub Jelinek <jakub@redhat.com>
|
||||
:Author: David S. Miller <davem@redhat.com>
|
||||
:Author: Richard Henderson <rth@cygnus.com>
|
||||
:Author: Jakub Jelinek <jakub@redhat.com>
|
||||
|
||||
This is a guide to device driver writers on how to use the DMA API
|
||||
with example pseudo-code. For a concise description of the API, see
|
||||
DMA-API.txt.
|
||||
|
||||
CPU and DMA addresses
|
||||
CPU and DMA addresses
|
||||
=====================
|
||||
|
||||
There are several kinds of addresses involved in the DMA API, and it's
|
||||
important to understand the differences.
|
||||
|
||||
The kernel normally uses virtual addresses. Any address returned by
|
||||
kmalloc(), vmalloc(), and similar interfaces is a virtual address and can
|
||||
be stored in a "void *".
|
||||
be stored in a ``void *``.
|
||||
|
||||
The virtual memory system (TLB, page tables, etc.) translates virtual
|
||||
addresses to CPU physical addresses, which are stored as "phys_addr_t" or
|
||||
@ -37,7 +39,7 @@ be restricted to a subset of that space. For example, even if a system
|
||||
supports 64-bit addresses for main memory and PCI BARs, it may use an IOMMU
|
||||
so devices only need to use 32-bit DMA addresses.
|
||||
|
||||
Here's a picture and some examples:
|
||||
Here's a picture and some examples::
|
||||
|
||||
CPU CPU Bus
|
||||
Virtual Physical Address
|
||||
@ -98,15 +100,16 @@ microprocessor architecture. You should use the DMA API rather than the
|
||||
bus-specific DMA API, i.e., use the dma_map_*() interfaces rather than the
|
||||
pci_map_*() interfaces.
|
||||
|
||||
First of all, you should make sure
|
||||
First of all, you should make sure::
|
||||
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
|
||||
is in your driver, which provides the definition of dma_addr_t. This type
|
||||
can hold any valid DMA address for the platform and should be used
|
||||
everywhere you hold a DMA address returned from the DMA mapping functions.
|
||||
|
||||
What memory is DMA'able?
|
||||
What memory is DMA'able?
|
||||
========================
|
||||
|
||||
The first piece of information you must know is what kernel memory can
|
||||
be used with the DMA mapping facilities. There has been an unwritten
|
||||
@ -143,7 +146,8 @@ What about block I/O and networking buffers? The block I/O and
|
||||
networking subsystems make sure that the buffers they use are valid
|
||||
for you to DMA from/to.
|
||||
|
||||
DMA addressing limitations
|
||||
DMA addressing limitations
|
||||
==========================
|
||||
|
||||
Does your device have any DMA addressing limitations? For example, is
|
||||
your device only capable of driving the low order 24-bits of address?
|
||||
@ -166,7 +170,7 @@ style to do this even if your device holds the default setting,
|
||||
because this shows that you did think about these issues wrt. your
|
||||
device.
|
||||
|
||||
The query is performed via a call to dma_set_mask_and_coherent():
|
||||
The query is performed via a call to dma_set_mask_and_coherent()::
|
||||
|
||||
int dma_set_mask_and_coherent(struct device *dev, u64 mask);
|
||||
|
||||
@ -175,12 +179,12 @@ If you have some special requirements, then the following two separate
|
||||
queries can be used instead:
|
||||
|
||||
The query for streaming mappings is performed via a call to
|
||||
dma_set_mask():
|
||||
dma_set_mask()::
|
||||
|
||||
int dma_set_mask(struct device *dev, u64 mask);
|
||||
|
||||
The query for consistent allocations is performed via a call
|
||||
to dma_set_coherent_mask():
|
||||
to dma_set_coherent_mask()::
|
||||
|
||||
int dma_set_coherent_mask(struct device *dev, u64 mask);
|
||||
|
||||
@ -209,7 +213,7 @@ of your driver reports that performance is bad or that the device is not
|
||||
even detected, you can ask them for the kernel messages to find out
|
||||
exactly why.
|
||||
|
||||
The standard 32-bit addressing device would do something like this:
|
||||
The standard 32-bit addressing device would do something like this::
|
||||
|
||||
if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
|
||||
dev_warn(dev, "mydev: No suitable DMA available\n");
|
||||
@ -225,7 +229,7 @@ than 64-bit addressing. For example, Sparc64 PCI SAC addressing is
|
||||
more efficient than DAC addressing.
|
||||
|
||||
Here is how you would handle a 64-bit capable device which can drive
|
||||
all 64-bits when accessing streaming DMA:
|
||||
all 64-bits when accessing streaming DMA::
|
||||
|
||||
int using_dac;
|
||||
|
||||
@ -239,7 +243,7 @@ all 64-bits when accessing streaming DMA:
|
||||
}
|
||||
|
||||
If a card is capable of using 64-bit consistent allocations as well,
|
||||
the case would look like this:
|
||||
the case would look like this::
|
||||
|
||||
int using_dac, consistent_using_dac;
|
||||
|
||||
@ -260,7 +264,7 @@ uses consistent allocations, one would have to check the return value from
|
||||
dma_set_coherent_mask().
|
||||
|
||||
Finally, if your device can only drive the low 24-bits of
|
||||
address you might do something like:
|
||||
address you might do something like::
|
||||
|
||||
if (dma_set_mask(dev, DMA_BIT_MASK(24))) {
|
||||
dev_warn(dev, "mydev: 24-bit DMA addressing not available\n");
|
||||
@ -280,7 +284,7 @@ only provide the functionality which the machine can handle. It
|
||||
is important that the last call to dma_set_mask() be for the
|
||||
most specific mask.
|
||||
|
||||
Here is pseudo-code showing how this might be done:
|
||||
Here is pseudo-code showing how this might be done::
|
||||
|
||||
#define PLAYBACK_ADDRESS_BITS DMA_BIT_MASK(32)
|
||||
#define RECORD_ADDRESS_BITS DMA_BIT_MASK(24)
|
||||
@ -308,7 +312,8 @@ A sound card was used as an example here because this genre of PCI
|
||||
devices seems to be littered with ISA chips given a PCI front end,
|
||||
and thus retaining the 16MB DMA addressing limitations of ISA.
|
||||
|
||||
Types of DMA mappings
|
||||
Types of DMA mappings
|
||||
=====================
|
||||
|
||||
There are two types of DMA mappings:
|
||||
|
||||
@ -336,12 +341,14 @@ There are two types of DMA mappings:
|
||||
to memory is immediately visible to the device, and vice
|
||||
versa. Consistent mappings guarantee this.
|
||||
|
||||
IMPORTANT: Consistent DMA memory does not preclude the usage of
|
||||
proper memory barriers. The CPU may reorder stores to
|
||||
.. important::
|
||||
|
||||
Consistent DMA memory does not preclude the usage of
|
||||
proper memory barriers. The CPU may reorder stores to
|
||||
consistent memory just as it may normal memory. Example:
|
||||
if it is important for the device to see the first word
|
||||
of a descriptor updated before the second, you must do
|
||||
something like:
|
||||
something like::
|
||||
|
||||
desc->word0 = address;
|
||||
wmb();
|
||||
@ -377,16 +384,17 @@ Also, systems with caches that aren't DMA-coherent will work better
|
||||
when the underlying buffers don't share cache lines with other data.
|
||||
|
||||
|
||||
Using Consistent DMA mappings.
|
||||
Using Consistent DMA mappings
|
||||
=============================
|
||||
|
||||
To allocate and map large (PAGE_SIZE or so) consistent DMA regions,
|
||||
you should do:
|
||||
you should do::
|
||||
|
||||
dma_addr_t dma_handle;
|
||||
|
||||
cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, gfp);
|
||||
|
||||
where device is a struct device *. This may be called in interrupt
|
||||
where device is a ``struct device *``. This may be called in interrupt
|
||||
context with the GFP_ATOMIC flag.
|
||||
|
||||
Size is the length of the region you want to allocate, in bytes.
|
||||
@ -415,7 +423,7 @@ exists (for example) to guarantee that if you allocate a chunk
|
||||
which is smaller than or equal to 64 kilobytes, the extent of the
|
||||
buffer you receive will not cross a 64K boundary.
|
||||
|
||||
To unmap and free such a DMA region, you call:
|
||||
To unmap and free such a DMA region, you call::
|
||||
|
||||
dma_free_coherent(dev, size, cpu_addr, dma_handle);
|
||||
|
||||
@ -430,7 +438,7 @@ a kmem_cache, but it uses dma_alloc_coherent(), not __get_free_pages().
|
||||
Also, it understands common hardware constraints for alignment,
|
||||
like queue heads needing to be aligned on N byte boundaries.
|
||||
|
||||
Create a dma_pool like this:
|
||||
Create a dma_pool like this::
|
||||
|
||||
struct dma_pool *pool;
|
||||
|
||||
@ -444,7 +452,7 @@ pass 0 for boundary; passing 4096 says memory allocated from this pool
|
||||
must not cross 4KByte boundaries (but at that time it may be better to
|
||||
use dma_alloc_coherent() directly instead).
|
||||
|
||||
Allocate memory from a DMA pool like this:
|
||||
Allocate memory from a DMA pool like this::
|
||||
|
||||
cpu_addr = dma_pool_alloc(pool, flags, &dma_handle);
|
||||
|
||||
@ -452,7 +460,7 @@ flags are GFP_KERNEL if blocking is permitted (not in_interrupt nor
|
||||
holding SMP locks), GFP_ATOMIC otherwise. Like dma_alloc_coherent(),
|
||||
this returns two values, cpu_addr and dma_handle.
|
||||
|
||||
Free memory that was allocated from a dma_pool like this:
|
||||
Free memory that was allocated from a dma_pool like this::
|
||||
|
||||
dma_pool_free(pool, cpu_addr, dma_handle);
|
||||
|
||||
@ -460,7 +468,7 @@ where pool is what you passed to dma_pool_alloc(), and cpu_addr and
|
||||
dma_handle are the values dma_pool_alloc() returned. This function
|
||||
may be called in interrupt context.
|
||||
|
||||
Destroy a dma_pool by calling:
|
||||
Destroy a dma_pool by calling::
|
||||
|
||||
dma_pool_destroy(pool);
|
||||
|
||||
@ -468,11 +476,12 @@ Make sure you've called dma_pool_free() for all memory allocated
|
||||
from a pool before you destroy the pool. This function may not
|
||||
be called in interrupt context.
|
||||
|
||||
DMA Direction
|
||||
DMA Direction
|
||||
=============
|
||||
|
||||
The interfaces described in subsequent portions of this document
|
||||
take a DMA direction argument, which is an integer and takes on
|
||||
one of the following values:
|
||||
one of the following values::
|
||||
|
||||
DMA_BIDIRECTIONAL
|
||||
DMA_TO_DEVICE
|
||||
@ -521,14 +530,15 @@ packets, map/unmap them with the DMA_TO_DEVICE direction
|
||||
specifier. For receive packets, just the opposite, map/unmap them
|
||||
with the DMA_FROM_DEVICE direction specifier.
|
||||
|
||||
Using Streaming DMA mappings
|
||||
Using Streaming DMA mappings
|
||||
============================
|
||||
|
||||
The streaming DMA mapping routines can be called from interrupt
|
||||
context. There are two versions of each map/unmap, one which will
|
||||
map/unmap a single memory region, and one which will map/unmap a
|
||||
scatterlist.
|
||||
|
||||
To map a single region, you do:
|
||||
To map a single region, you do::
|
||||
|
||||
struct device *dev = &my_dev->dev;
|
||||
dma_addr_t dma_handle;
|
||||
@ -545,37 +555,16 @@ To map a single region, you do:
|
||||
goto map_error_handling;
|
||||
}
|
||||
|
||||
and to unmap it:
|
||||
and to unmap it::
|
||||
|
||||
dma_unmap_single(dev, dma_handle, size, direction);
|
||||
|
||||
You should call dma_mapping_error() as dma_map_single() could fail and return
|
||||
error. Not all DMA implementations support the dma_mapping_error() interface.
|
||||
However, it is a good practice to call dma_mapping_error() interface, which
|
||||
will invoke the generic mapping error check interface. Doing so will ensure
|
||||
that the mapping code will work correctly on all DMA implementations without
|
||||
any dependency on the specifics of the underlying implementation. Using the
|
||||
returned address without checking for errors could result in failures ranging
|
||||
from panics to silent data corruption. A couple of examples of incorrect ways
|
||||
to check for errors that make assumptions about the underlying DMA
|
||||
implementation are as follows and these are applicable to dma_map_page() as
|
||||
well.
|
||||
|
||||
Incorrect example 1:
|
||||
dma_addr_t dma_handle;
|
||||
|
||||
dma_handle = dma_map_single(dev, addr, size, direction);
|
||||
if ((dma_handle & 0xffff != 0) || (dma_handle >= 0x1000000)) {
|
||||
goto map_error;
|
||||
}
|
||||
|
||||
Incorrect example 2:
|
||||
dma_addr_t dma_handle;
|
||||
|
||||
dma_handle = dma_map_single(dev, addr, size, direction);
|
||||
if (dma_handle == DMA_ERROR_CODE) {
|
||||
goto map_error;
|
||||
}
|
||||
error. Doing so will ensure that the mapping code will work correctly on all
|
||||
DMA implementations without any dependency on the specifics of the underlying
|
||||
implementation. Using the returned address without checking for errors could
|
||||
result in failures ranging from panics to silent data corruption. The same
|
||||
applies to dma_map_page() as well.
|
||||
|
||||
You should call dma_unmap_single() when the DMA activity is finished, e.g.,
|
||||
from the interrupt which told you that the DMA transfer is done.
|
||||
@ -584,7 +573,7 @@ Using CPU pointers like this for single mappings has a disadvantage:
|
||||
you cannot reference HIGHMEM memory in this way. Thus, there is a
|
||||
map/unmap interface pair akin to dma_{map,unmap}_single(). These
|
||||
interfaces deal with page/offset pairs instead of CPU pointers.
|
||||
Specifically:
|
||||
Specifically::
|
||||
|
||||
struct device *dev = &my_dev->dev;
|
||||
dma_addr_t dma_handle;
|
||||
@ -614,7 +603,7 @@ error as outlined under the dma_map_single() discussion.
|
||||
You should call dma_unmap_page() when the DMA activity is finished, e.g.,
|
||||
from the interrupt which told you that the DMA transfer is done.
|
||||
|
||||
With scatterlists, you map a region gathered from several regions by:
|
||||
With scatterlists, you map a region gathered from several regions by::
|
||||
|
||||
int i, count = dma_map_sg(dev, sglist, nents, direction);
|
||||
struct scatterlist *sg;
|
||||
@ -638,16 +627,18 @@ Then you should loop count times (note: this can be less than nents times)
|
||||
and use sg_dma_address() and sg_dma_len() macros where you previously
|
||||
accessed sg->address and sg->length as shown above.
|
||||
|
||||
To unmap a scatterlist, just call:
|
||||
To unmap a scatterlist, just call::
|
||||
|
||||
dma_unmap_sg(dev, sglist, nents, direction);
|
||||
|
||||
Again, make sure DMA activity has already finished.
|
||||
|
||||
PLEASE NOTE: The 'nents' argument to the dma_unmap_sg call must be
|
||||
the _same_ one you passed into the dma_map_sg call,
|
||||
it should _NOT_ be the 'count' value _returned_ from the
|
||||
dma_map_sg call.
|
||||
.. note::
|
||||
|
||||
The 'nents' argument to the dma_unmap_sg call must be
|
||||
the _same_ one you passed into the dma_map_sg call,
|
||||
it should _NOT_ be the 'count' value _returned_ from the
|
||||
dma_map_sg call.
|
||||
|
||||
Every dma_map_{single,sg}() call should have its dma_unmap_{single,sg}()
|
||||
counterpart, because the DMA address space is a shared resource and
|
||||
@ -659,11 +650,11 @@ properly in order for the CPU and device to see the most up-to-date and
|
||||
correct copy of the DMA buffer.
|
||||
|
||||
So, firstly, just map it with dma_map_{single,sg}(), and after each DMA
|
||||
transfer call either:
|
||||
transfer call either::
|
||||
|
||||
dma_sync_single_for_cpu(dev, dma_handle, size, direction);
|
||||
|
||||
or:
|
||||
or::
|
||||
|
||||
dma_sync_sg_for_cpu(dev, sglist, nents, direction);
|
||||
|
||||
@ -671,17 +662,19 @@ as appropriate.
|
||||
|
||||
Then, if you wish to let the device get at the DMA area again,
|
||||
finish accessing the data with the CPU, and then before actually
|
||||
giving the buffer to the hardware call either:
|
||||
giving the buffer to the hardware call either::
|
||||
|
||||
dma_sync_single_for_device(dev, dma_handle, size, direction);
|
||||
|
||||
or:
|
||||
or::
|
||||
|
||||
dma_sync_sg_for_device(dev, sglist, nents, direction);
|
||||
|
||||
as appropriate.
|
||||
|
||||
PLEASE NOTE: The 'nents' argument to dma_sync_sg_for_cpu() and
|
||||
.. note::
|
||||
|
||||
The 'nents' argument to dma_sync_sg_for_cpu() and
|
||||
dma_sync_sg_for_device() must be the same passed to
|
||||
dma_map_sg(). It is _NOT_ the count returned by
|
||||
dma_map_sg().
|
||||
@ -692,7 +685,7 @@ dma_map_*() call till dma_unmap_*(), then you don't have to call the
|
||||
dma_sync_*() routines at all.
|
||||
|
||||
Here is pseudo code which shows a situation in which you would need
|
||||
to use the dma_sync_*() interfaces.
|
||||
to use the dma_sync_*() interfaces::
|
||||
|
||||
my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len)
|
||||
{
|
||||
@ -768,7 +761,8 @@ is planned to completely remove virt_to_bus() and bus_to_virt() as
|
||||
they are entirely deprecated. Some ports already do not provide these
|
||||
as it is impossible to correctly support them.
|
||||
|
||||
Handling Errors
|
||||
Handling Errors
|
||||
===============
|
||||
|
||||
DMA address space is limited on some architectures and an allocation
|
||||
failure can be determined by:
|
||||
@ -776,7 +770,7 @@ failure can be determined by:
|
||||
- checking if dma_alloc_coherent() returns NULL or dma_map_sg returns 0
|
||||
|
||||
- checking the dma_addr_t returned from dma_map_single() and dma_map_page()
|
||||
by using dma_mapping_error():
|
||||
by using dma_mapping_error()::
|
||||
|
||||
dma_addr_t dma_handle;
|
||||
|
||||
@ -794,7 +788,8 @@ failure can be determined by:
|
||||
of a multiple page mapping attempt. These example are applicable to
|
||||
dma_map_page() as well.
|
||||
|
||||
Example 1:
|
||||
Example 1::
|
||||
|
||||
dma_addr_t dma_handle1;
|
||||
dma_addr_t dma_handle2;
|
||||
|
||||
@ -823,8 +818,12 @@ Example 1:
|
||||
dma_unmap_single(dma_handle1);
|
||||
map_error_handling1:
|
||||
|
||||
Example 2: (if buffers are allocated in a loop, unmap all mapped buffers when
|
||||
mapping error is detected in the middle)
|
||||
Example 2::
|
||||
|
||||
/*
|
||||
* if buffers are allocated in a loop, unmap all mapped buffers when
|
||||
* mapping error is detected in the middle
|
||||
*/
|
||||
|
||||
dma_addr_t dma_addr;
|
||||
dma_addr_t array[DMA_BUFFERS];
|
||||
@ -867,7 +866,8 @@ SCSI drivers must return SCSI_MLQUEUE_HOST_BUSY if the DMA mapping
|
||||
fails in the queuecommand hook. This means that the SCSI subsystem
|
||||
passes the command to the driver again later.
|
||||
|
||||
Optimizing Unmap State Space Consumption
|
||||
Optimizing Unmap State Space Consumption
|
||||
========================================
|
||||
|
||||
On many platforms, dma_unmap_{single,page}() is simply a nop.
|
||||
Therefore, keeping track of the mapping address and length is a waste
|
||||
@ -879,7 +879,7 @@ Actually, instead of describing the macros one by one, we'll
|
||||
transform some example code.
|
||||
|
||||
1) Use DEFINE_DMA_UNMAP_{ADDR,LEN} in state saving structures.
|
||||
Example, before:
|
||||
Example, before::
|
||||
|
||||
struct ring_state {
|
||||
struct sk_buff *skb;
|
||||
@ -887,7 +887,7 @@ transform some example code.
|
||||
__u32 len;
|
||||
};
|
||||
|
||||
after:
|
||||
after::
|
||||
|
||||
struct ring_state {
|
||||
struct sk_buff *skb;
|
||||
@ -896,23 +896,23 @@ transform some example code.
|
||||
};
|
||||
|
||||
2) Use dma_unmap_{addr,len}_set() to set these values.
|
||||
Example, before:
|
||||
Example, before::
|
||||
|
||||
ringp->mapping = FOO;
|
||||
ringp->len = BAR;
|
||||
|
||||
after:
|
||||
after::
|
||||
|
||||
dma_unmap_addr_set(ringp, mapping, FOO);
|
||||
dma_unmap_len_set(ringp, len, BAR);
|
||||
|
||||
3) Use dma_unmap_{addr,len}() to access these values.
|
||||
Example, before:
|
||||
Example, before::
|
||||
|
||||
dma_unmap_single(dev, ringp->mapping, ringp->len,
|
||||
DMA_FROM_DEVICE);
|
||||
|
||||
after:
|
||||
after::
|
||||
|
||||
dma_unmap_single(dev,
|
||||
dma_unmap_addr(ringp, mapping),
|
||||
@ -923,7 +923,8 @@ It really should be self-explanatory. We treat the ADDR and LEN
|
||||
separately, because it is possible for an implementation to only
|
||||
need the address in order to perform the unmap operation.
|
||||
|
||||
Platform Issues
|
||||
Platform Issues
|
||||
===============
|
||||
|
||||
If you are just writing drivers for Linux and do not maintain
|
||||
an architecture port for the kernel, you can safely skip down
|
||||
@ -949,12 +950,13 @@ to "Closing".
|
||||
alignment constraints (e.g. the alignment constraints about 64-bit
|
||||
objects).
|
||||
|
||||
Closing
|
||||
Closing
|
||||
=======
|
||||
|
||||
This document, and the API itself, would not be in its current
|
||||
form without the feedback and suggestions from numerous individuals.
|
||||
We would like to specifically mention, in no particular order, the
|
||||
following people:
|
||||
following people::
|
||||
|
||||
Russell King <rmk@arm.linux.org.uk>
|
||||
Leo Dagum <dagum@barrel.engr.sgi.com>
|
||||
|
@ -1,7 +1,8 @@
|
||||
Dynamic DMA mapping using the generic device
|
||||
============================================
|
||||
============================================
|
||||
Dynamic DMA mapping using the generic device
|
||||
============================================
|
||||
|
||||
James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
|
||||
:Author: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
|
||||
|
||||
This document describes the DMA API. For a more gentle introduction
|
||||
of the API (and actual examples), see Documentation/DMA-API-HOWTO.txt.
|
||||
@ -12,10 +13,10 @@ machines. Unless you know that your driver absolutely has to support
|
||||
non-consistent platforms (this is usually only legacy platforms) you
|
||||
should only use the API described in part I.
|
||||
|
||||
Part I - dma_ API
|
||||
-------------------------------------
|
||||
Part I - dma_API
|
||||
----------------
|
||||
|
||||
To get the dma_ API, you must #include <linux/dma-mapping.h>. This
|
||||
To get the dma_API, you must #include <linux/dma-mapping.h>. This
|
||||
provides dma_addr_t and the interfaces described below.
|
||||
|
||||
A dma_addr_t can hold any valid DMA address for the platform. It can be
|
||||
@ -26,9 +27,11 @@ address space and the DMA address space.
|
||||
Part Ia - Using large DMA-coherent buffers
|
||||
------------------------------------------
|
||||
|
||||
void *
|
||||
dma_alloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
::
|
||||
|
||||
void *
|
||||
dma_alloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
|
||||
Consistent memory is memory for which a write by either the device or
|
||||
the processor can immediately be read by the processor or device
|
||||
@ -51,20 +54,24 @@ consolidate your requests for consistent memory as much as possible.
|
||||
The simplest way to do that is to use the dma_pool calls (see below).
|
||||
|
||||
The flag parameter (dma_alloc_coherent() only) allows the caller to
|
||||
specify the GFP_ flags (see kmalloc()) for the allocation (the
|
||||
specify the ``GFP_`` flags (see kmalloc()) for the allocation (the
|
||||
implementation may choose to ignore flags that affect the location of
|
||||
the returned memory, like GFP_DMA).
|
||||
|
||||
void *
|
||||
dma_zalloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
::
|
||||
|
||||
void *
|
||||
dma_zalloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
|
||||
Wraps dma_alloc_coherent() and also zeroes the returned memory if the
|
||||
allocation attempt succeeded.
|
||||
|
||||
void
|
||||
dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
::
|
||||
|
||||
void
|
||||
dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
|
||||
Free a region of consistent memory you previously allocated. dev,
|
||||
size and dma_handle must all be the same as those passed into
|
||||
@ -78,7 +85,7 @@ may only be called with IRQs enabled.
|
||||
Part Ib - Using small DMA-coherent buffers
|
||||
------------------------------------------
|
||||
|
||||
To get this part of the dma_ API, you must #include <linux/dmapool.h>
|
||||
To get this part of the dma_API, you must #include <linux/dmapool.h>
|
||||
|
||||
Many drivers need lots of small DMA-coherent memory regions for DMA
|
||||
descriptors or I/O buffers. Rather than allocating in units of a page
|
||||
@ -88,6 +95,8 @@ not __get_free_pages(). Also, they understand common hardware constraints
|
||||
for alignment, like queue heads needing to be aligned on N-byte boundaries.
|
||||
|
||||
|
||||
::
|
||||
|
||||
struct dma_pool *
|
||||
dma_pool_create(const char *name, struct device *dev,
|
||||
size_t size, size_t align, size_t alloc);
|
||||
@ -103,16 +112,21 @@ in bytes, and must be a power of two). If your device has no boundary
|
||||
crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
|
||||
from this pool must not cross 4KByte boundaries.
|
||||
|
||||
::
|
||||
|
||||
void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
dma_addr_t *handle)
|
||||
void *
|
||||
dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
dma_addr_t *handle)
|
||||
|
||||
Wraps dma_pool_alloc() and also zeroes the returned memory if the
|
||||
allocation attempt succeeded.
|
||||
|
||||
|
||||
void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
|
||||
dma_addr_t *dma_handle);
|
||||
::
|
||||
|
||||
void *
|
||||
dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
|
||||
dma_addr_t *dma_handle);
|
||||
|
||||
This allocates memory from the pool; the returned memory will meet the
|
||||
size and alignment requirements specified at creation time. Pass
|
||||
@ -122,16 +136,20 @@ blocking. Like dma_alloc_coherent(), this returns two values: an
|
||||
address usable by the CPU, and the DMA address usable by the pool's
|
||||
device.
|
||||
|
||||
::
|
||||
|
||||
void dma_pool_free(struct dma_pool *pool, void *vaddr,
|
||||
dma_addr_t addr);
|
||||
void
|
||||
dma_pool_free(struct dma_pool *pool, void *vaddr,
|
||||
dma_addr_t addr);
|
||||
|
||||
This puts memory back into the pool. The pool is what was passed to
|
||||
dma_pool_alloc(); the CPU (vaddr) and DMA addresses are what
|
||||
were returned when that routine allocated the memory being freed.
|
||||
|
||||
::
|
||||
|
||||
void dma_pool_destroy(struct dma_pool *pool);
|
||||
void
|
||||
dma_pool_destroy(struct dma_pool *pool);
|
||||
|
||||
dma_pool_destroy() frees the resources of the pool. It must be
|
||||
called in a context which can sleep. Make sure you've freed all allocated
|
||||
@ -141,32 +159,40 @@ memory back to the pool before you destroy it.
|
||||
Part Ic - DMA addressing limitations
|
||||
------------------------------------
|
||||
|
||||
int
|
||||
dma_set_mask_and_coherent(struct device *dev, u64 mask)
|
||||
::
|
||||
|
||||
int
|
||||
dma_set_mask_and_coherent(struct device *dev, u64 mask)
|
||||
|
||||
Checks to see if the mask is possible and updates the device
|
||||
streaming and coherent DMA mask parameters if it is.
|
||||
|
||||
Returns: 0 if successful and a negative error if not.
|
||||
|
||||
int
|
||||
dma_set_mask(struct device *dev, u64 mask)
|
||||
::
|
||||
|
||||
int
|
||||
dma_set_mask(struct device *dev, u64 mask)
|
||||
|
||||
Checks to see if the mask is possible and updates the device
|
||||
parameters if it is.
|
||||
|
||||
Returns: 0 if successful and a negative error if not.
|
||||
|
||||
int
|
||||
dma_set_coherent_mask(struct device *dev, u64 mask)
|
||||
::
|
||||
|
||||
int
|
||||
dma_set_coherent_mask(struct device *dev, u64 mask)
|
||||
|
||||
Checks to see if the mask is possible and updates the device
|
||||
parameters if it is.
|
||||
|
||||
Returns: 0 if successful and a negative error if not.
|
||||
|
||||
u64
|
||||
dma_get_required_mask(struct device *dev)
|
||||
::
|
||||
|
||||
u64
|
||||
dma_get_required_mask(struct device *dev)
|
||||
|
||||
This API returns the mask that the platform requires to
|
||||
operate efficiently. Usually this means the returned mask
|
||||
@ -182,94 +208,107 @@ call to set the mask to the value returned.
|
||||
Part Id - Streaming DMA mappings
|
||||
--------------------------------
|
||||
|
||||
dma_addr_t
|
||||
dma_map_single(struct device *dev, void *cpu_addr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
::
|
||||
|
||||
dma_addr_t
|
||||
dma_map_single(struct device *dev, void *cpu_addr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Maps a piece of processor virtual memory so it can be accessed by the
|
||||
device and returns the DMA address of the memory.
|
||||
|
||||
The direction for both APIs may be converted freely by casting.
|
||||
However the dma_ API uses a strongly typed enumerator for its
|
||||
However the dma_API uses a strongly typed enumerator for its
|
||||
direction:
|
||||
|
||||
======================= =============================================
|
||||
DMA_NONE no direction (used for debugging)
|
||||
DMA_TO_DEVICE data is going from the memory to the device
|
||||
DMA_FROM_DEVICE data is coming from the device to the memory
|
||||
DMA_BIDIRECTIONAL direction isn't known
|
||||
======================= =============================================
|
||||
|
||||
Notes: Not all memory regions in a machine can be mapped by this API.
|
||||
Further, contiguous kernel virtual space may not be contiguous as
|
||||
physical memory. Since this API does not provide any scatter/gather
|
||||
capability, it will fail if the user tries to map a non-physically
|
||||
contiguous piece of memory. For this reason, memory to be mapped by
|
||||
this API should be obtained from sources which guarantee it to be
|
||||
physically contiguous (like kmalloc).
|
||||
.. note::
|
||||
|
||||
Further, the DMA address of the memory must be within the
|
||||
dma_mask of the device (the dma_mask is a bit mask of the
|
||||
addressable region for the device, i.e., if the DMA address of
|
||||
the memory ANDed with the dma_mask is still equal to the DMA
|
||||
address, then the device can perform DMA to the memory). To
|
||||
ensure that the memory allocated by kmalloc is within the dma_mask,
|
||||
the driver may specify various platform-dependent flags to restrict
|
||||
the DMA address range of the allocation (e.g., on x86, GFP_DMA
|
||||
guarantees to be within the first 16MB of available DMA addresses,
|
||||
as required by ISA devices).
|
||||
Not all memory regions in a machine can be mapped by this API.
|
||||
Further, contiguous kernel virtual space may not be contiguous as
|
||||
physical memory. Since this API does not provide any scatter/gather
|
||||
capability, it will fail if the user tries to map a non-physically
|
||||
contiguous piece of memory. For this reason, memory to be mapped by
|
||||
this API should be obtained from sources which guarantee it to be
|
||||
physically contiguous (like kmalloc).
|
||||
|
||||
Note also that the above constraints on physical contiguity and
|
||||
dma_mask may not apply if the platform has an IOMMU (a device which
|
||||
maps an I/O DMA address to a physical memory address). However, to be
|
||||
portable, device driver writers may *not* assume that such an IOMMU
|
||||
exists.
|
||||
Further, the DMA address of the memory must be within the
|
||||
dma_mask of the device (the dma_mask is a bit mask of the
|
||||
addressable region for the device, i.e., if the DMA address of
|
||||
the memory ANDed with the dma_mask is still equal to the DMA
|
||||
address, then the device can perform DMA to the memory). To
|
||||
ensure that the memory allocated by kmalloc is within the dma_mask,
|
||||
the driver may specify various platform-dependent flags to restrict
|
||||
the DMA address range of the allocation (e.g., on x86, GFP_DMA
|
||||
guarantees to be within the first 16MB of available DMA addresses,
|
||||
as required by ISA devices).
|
||||
|
||||
Warnings: Memory coherency operates at a granularity called the cache
|
||||
line width. In order for memory mapped by this API to operate
|
||||
correctly, the mapped region must begin exactly on a cache line
|
||||
boundary and end exactly on one (to prevent two separately mapped
|
||||
regions from sharing a single cache line). Since the cache line size
|
||||
may not be known at compile time, the API will not enforce this
|
||||
requirement. Therefore, it is recommended that driver writers who
|
||||
don't take special care to determine the cache line size at run time
|
||||
only map virtual regions that begin and end on page boundaries (which
|
||||
are guaranteed also to be cache line boundaries).
|
||||
Note also that the above constraints on physical contiguity and
|
||||
dma_mask may not apply if the platform has an IOMMU (a device which
|
||||
maps an I/O DMA address to a physical memory address). However, to be
|
||||
portable, device driver writers may *not* assume that such an IOMMU
|
||||
exists.
|
||||
|
||||
DMA_TO_DEVICE synchronisation must be done after the last modification
|
||||
of the memory region by the software and before it is handed off to
|
||||
the device. Once this primitive is used, memory covered by this
|
||||
primitive should be treated as read-only by the device. If the device
|
||||
may write to it at any point, it should be DMA_BIDIRECTIONAL (see
|
||||
below).
|
||||
.. warning::
|
||||
|
||||
DMA_FROM_DEVICE synchronisation must be done before the driver
|
||||
accesses data that may be changed by the device. This memory should
|
||||
be treated as read-only by the driver. If the driver needs to write
|
||||
to it at any point, it should be DMA_BIDIRECTIONAL (see below).
|
||||
Memory coherency operates at a granularity called the cache
|
||||
line width. In order for memory mapped by this API to operate
|
||||
correctly, the mapped region must begin exactly on a cache line
|
||||
boundary and end exactly on one (to prevent two separately mapped
|
||||
regions from sharing a single cache line). Since the cache line size
|
||||
may not be known at compile time, the API will not enforce this
|
||||
requirement. Therefore, it is recommended that driver writers who
|
||||
don't take special care to determine the cache line size at run time
|
||||
only map virtual regions that begin and end on page boundaries (which
|
||||
are guaranteed also to be cache line boundaries).
|
||||
|
||||
DMA_BIDIRECTIONAL requires special handling: it means that the driver
|
||||
isn't sure if the memory was modified before being handed off to the
|
||||
device and also isn't sure if the device will also modify it. Thus,
|
||||
you must always sync bidirectional memory twice: once before the
|
||||
memory is handed off to the device (to make sure all memory changes
|
||||
are flushed from the processor) and once before the data may be
|
||||
accessed after being used by the device (to make sure any processor
|
||||
cache lines are updated with data that the device may have changed).
|
||||
DMA_TO_DEVICE synchronisation must be done after the last modification
|
||||
of the memory region by the software and before it is handed off to
|
||||
the device. Once this primitive is used, memory covered by this
|
||||
primitive should be treated as read-only by the device. If the device
|
||||
may write to it at any point, it should be DMA_BIDIRECTIONAL (see
|
||||
below).
|
||||
|
||||
void
|
||||
dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
DMA_FROM_DEVICE synchronisation must be done before the driver
|
||||
accesses data that may be changed by the device. This memory should
|
||||
be treated as read-only by the driver. If the driver needs to write
|
||||
to it at any point, it should be DMA_BIDIRECTIONAL (see below).
|
||||
|
||||
DMA_BIDIRECTIONAL requires special handling: it means that the driver
|
||||
isn't sure if the memory was modified before being handed off to the
|
||||
device and also isn't sure if the device will also modify it. Thus,
|
||||
you must always sync bidirectional memory twice: once before the
|
||||
memory is handed off to the device (to make sure all memory changes
|
||||
are flushed from the processor) and once before the data may be
|
||||
accessed after being used by the device (to make sure any processor
|
||||
cache lines are updated with data that the device may have changed).
|
||||
|
||||
::
|
||||
|
||||
void
|
||||
dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Unmaps the region previously mapped. All the parameters passed in
|
||||
must be identical to those passed in (and returned) by the mapping
|
||||
API.
|
||||
|
||||
dma_addr_t
|
||||
dma_map_page(struct device *dev, struct page *page,
|
||||
unsigned long offset, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
::
|
||||
|
||||
dma_addr_t
|
||||
dma_map_page(struct device *dev, struct page *page,
|
||||
unsigned long offset, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
void
|
||||
dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
API for mapping and unmapping for pages. All the notes and warnings
|
||||
for the other mapping APIs apply here. Also, although the <offset>
|
||||
@ -277,20 +316,24 @@ and <size> parameters are provided to do partial page mapping, it is
|
||||
recommended that you never use these unless you really know what the
|
||||
cache width is.
|
||||
|
||||
dma_addr_t
|
||||
dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
|
||||
enum dma_data_direction dir, unsigned long attrs)
|
||||
::
|
||||
|
||||
void
|
||||
dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
|
||||
enum dma_data_direction dir, unsigned long attrs)
|
||||
dma_addr_t
|
||||
dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
|
||||
enum dma_data_direction dir, unsigned long attrs)
|
||||
|
||||
void
|
||||
dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
|
||||
enum dma_data_direction dir, unsigned long attrs)
|
||||
|
||||
API for mapping and unmapping for MMIO resources. All the notes and
|
||||
warnings for the other mapping APIs apply here. The API should only be
|
||||
used to map device MMIO resources, mapping of RAM is not permitted.
|
||||
|
||||
int
|
||||
dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
|
||||
::
|
||||
|
||||
int
|
||||
dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
|
||||
|
||||
In some circumstances dma_map_single(), dma_map_page() and dma_map_resource()
|
||||
will fail to create a mapping. A driver can check for these errors by testing
|
||||
@ -298,9 +341,11 @@ the returned DMA address with dma_mapping_error(). A non-zero return value
|
||||
means the mapping could not be created and the driver should take appropriate
|
||||
action (e.g. reduce current DMA mapping usage or delay and try again later).
|
||||
|
||||
::
|
||||
|
||||
int
|
||||
dma_map_sg(struct device *dev, struct scatterlist *sg,
|
||||
int nents, enum dma_data_direction direction)
|
||||
int nents, enum dma_data_direction direction)
|
||||
|
||||
Returns: the number of DMA address segments mapped (this may be shorter
|
||||
than <nents> passed in if some elements of the scatter/gather list are
|
||||
@ -316,7 +361,7 @@ critical that the driver do something, in the case of a block driver
|
||||
aborting the request or even oopsing is better than doing nothing and
|
||||
corrupting the filesystem.
|
||||
|
||||
With scatterlists, you use the resulting mapping like this:
|
||||
With scatterlists, you use the resulting mapping like this::
|
||||
|
||||
int i, count = dma_map_sg(dev, sglist, nents, direction);
|
||||
struct scatterlist *sg;
|
||||
@ -337,9 +382,11 @@ Then you should loop count times (note: this can be less than nents times)
|
||||
and use sg_dma_address() and sg_dma_len() macros where you previously
|
||||
accessed sg->address and sg->length as shown above.
|
||||
|
||||
::
|
||||
|
||||
void
|
||||
dma_unmap_sg(struct device *dev, struct scatterlist *sg,
|
||||
int nents, enum dma_data_direction direction)
|
||||
int nents, enum dma_data_direction direction)
|
||||
|
||||
Unmap the previously mapped scatter/gather list. All the parameters
|
||||
must be the same as those and passed in to the scatter/gather mapping
|
||||
@ -348,18 +395,27 @@ API.
|
||||
Note: <nents> must be the number you passed in, *not* the number of
|
||||
DMA address entries returned.
|
||||
|
||||
void
|
||||
dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nents,
|
||||
enum dma_data_direction direction)
|
||||
void
|
||||
dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nents,
|
||||
enum dma_data_direction direction)
|
||||
::
|
||||
|
||||
void
|
||||
dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
|
||||
size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
void
|
||||
dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
|
||||
size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
void
|
||||
dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
|
||||
int nents,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
void
|
||||
dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
|
||||
int nents,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Synchronise a single contiguous or scatter/gather mapping for the CPU
|
||||
and device. With the sync_sg API, all the parameters must be the same
|
||||
@ -367,36 +423,41 @@ as those passed into the single mapping API. With the sync_single API,
|
||||
you can use dma_handle and size parameters that aren't identical to
|
||||
those passed into the single mapping API to do a partial sync.
|
||||
|
||||
Notes: You must do this:
|
||||
|
||||
- Before reading values that have been written by DMA from the device
|
||||
(use the DMA_FROM_DEVICE direction)
|
||||
- After writing values that will be written to the device using DMA
|
||||
(use the DMA_TO_DEVICE) direction
|
||||
- before *and* after handing memory to the device if the memory is
|
||||
DMA_BIDIRECTIONAL
|
||||
.. note::
|
||||
|
||||
You must do this:
|
||||
|
||||
- Before reading values that have been written by DMA from the device
|
||||
(use the DMA_FROM_DEVICE direction)
|
||||
- After writing values that will be written to the device using DMA
|
||||
(use the DMA_TO_DEVICE) direction
|
||||
- before *and* after handing memory to the device if the memory is
|
||||
DMA_BIDIRECTIONAL
|
||||
|
||||
See also dma_map_single().
|
||||
|
||||
dma_addr_t
|
||||
dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size,
|
||||
enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
::
|
||||
|
||||
void
|
||||
dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr,
|
||||
size_t size, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
dma_addr_t
|
||||
dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size,
|
||||
enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
|
||||
int
|
||||
dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
|
||||
int nents, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
void
|
||||
dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr,
|
||||
size_t size, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
|
||||
void
|
||||
dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl,
|
||||
int nents, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
int
|
||||
dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
|
||||
int nents, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
|
||||
void
|
||||
dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl,
|
||||
int nents, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
|
||||
The four functions above are just like the counterpart functions
|
||||
without the _attrs suffixes, except that they pass an optional
|
||||
@ -410,37 +471,38 @@ is identical to those of the corresponding function
|
||||
without the _attrs suffix. As a result dma_map_single_attrs()
|
||||
can generally replace dma_map_single(), etc.
|
||||
|
||||
As an example of the use of the *_attrs functions, here's how
|
||||
As an example of the use of the ``*_attrs`` functions, here's how
|
||||
you could pass an attribute DMA_ATTR_FOO when mapping memory
|
||||
for DMA:
|
||||
for DMA::
|
||||
|
||||
#include <linux/dma-mapping.h>
|
||||
/* DMA_ATTR_FOO should be defined in linux/dma-mapping.h and
|
||||
* documented in Documentation/DMA-attributes.txt */
|
||||
...
|
||||
#include <linux/dma-mapping.h>
|
||||
/* DMA_ATTR_FOO should be defined in linux/dma-mapping.h and
|
||||
* documented in Documentation/DMA-attributes.txt */
|
||||
...
|
||||
|
||||
unsigned long attr;
|
||||
attr |= DMA_ATTR_FOO;
|
||||
....
|
||||
n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, attr);
|
||||
....
|
||||
unsigned long attr;
|
||||
attr |= DMA_ATTR_FOO;
|
||||
....
|
||||
n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, attr);
|
||||
....
|
||||
|
||||
Architectures that care about DMA_ATTR_FOO would check for its
|
||||
presence in their implementations of the mapping and unmapping
|
||||
routines, e.g.:
|
||||
routines, e.g.:::
|
||||
|
||||
void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
|
||||
size_t size, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
{
|
||||
....
|
||||
if (attrs & DMA_ATTR_FOO)
|
||||
/* twizzle the frobnozzle */
|
||||
....
|
||||
void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
|
||||
size_t size, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
{
|
||||
....
|
||||
if (attrs & DMA_ATTR_FOO)
|
||||
/* twizzle the frobnozzle */
|
||||
....
|
||||
}
|
||||
|
||||
|
||||
Part II - Advanced dma_ usage
|
||||
-----------------------------
|
||||
Part II - Advanced dma usage
|
||||
----------------------------
|
||||
|
||||
Warning: These pieces of the DMA API should not be used in the
|
||||
majority of cases, since they cater for unlikely corner cases that
|
||||
@ -450,9 +512,11 @@ If you don't understand how cache line coherency works between a
|
||||
processor and an I/O device, you should not be using this part of the
|
||||
API at all.
|
||||
|
||||
void *
|
||||
dma_alloc_noncoherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
::
|
||||
|
||||
void *
|
||||
dma_alloc_noncoherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
|
||||
Identical to dma_alloc_coherent() except that the platform will
|
||||
choose to return either consistent or non-consistent memory as it sees
|
||||
@ -468,39 +532,49 @@ only use this API if you positively know your driver will be
|
||||
required to work on one of the rare (usually non-PCI) architectures
|
||||
that simply cannot make consistent memory.
|
||||
|
||||
void
|
||||
dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
::
|
||||
|
||||
void
|
||||
dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
|
||||
Free memory allocated by the nonconsistent API. All parameters must
|
||||
be identical to those passed in (and returned by
|
||||
dma_alloc_noncoherent()).
|
||||
|
||||
int
|
||||
dma_get_cache_alignment(void)
|
||||
::
|
||||
|
||||
int
|
||||
dma_get_cache_alignment(void)
|
||||
|
||||
Returns the processor cache alignment. This is the absolute minimum
|
||||
alignment *and* width that you must observe when either mapping
|
||||
memory or doing partial flushes.
|
||||
|
||||
Notes: This API may return a number *larger* than the actual cache
|
||||
line, but it will guarantee that one or more cache lines fit exactly
|
||||
into the width returned by this call. It will also always be a power
|
||||
of two for easy alignment.
|
||||
.. note::
|
||||
|
||||
void
|
||||
dma_cache_sync(struct device *dev, void *vaddr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
This API may return a number *larger* than the actual cache
|
||||
line, but it will guarantee that one or more cache lines fit exactly
|
||||
into the width returned by this call. It will also always be a power
|
||||
of two for easy alignment.
|
||||
|
||||
::
|
||||
|
||||
void
|
||||
dma_cache_sync(struct device *dev, void *vaddr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Do a partial sync of memory that was allocated by
|
||||
dma_alloc_noncoherent(), starting at virtual address vaddr and
|
||||
continuing on for size. Again, you *must* observe the cache line
|
||||
boundaries when doing this.
|
||||
|
||||
int
|
||||
dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
|
||||
dma_addr_t device_addr, size_t size, int
|
||||
flags)
|
||||
::
|
||||
|
||||
int
|
||||
dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
|
||||
dma_addr_t device_addr, size_t size, int
|
||||
flags)
|
||||
|
||||
Declare region of memory to be handed out by dma_alloc_coherent() when
|
||||
it's asked for coherent memory for this device.
|
||||
@ -516,21 +590,21 @@ size is the size of the area (must be multiples of PAGE_SIZE).
|
||||
|
||||
flags can be ORed together and are:
|
||||
|
||||
DMA_MEMORY_MAP - request that the memory returned from
|
||||
dma_alloc_coherent() be directly writable.
|
||||
- DMA_MEMORY_MAP - request that the memory returned from
|
||||
dma_alloc_coherent() be directly writable.
|
||||
|
||||
DMA_MEMORY_IO - request that the memory returned from
|
||||
dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc.
|
||||
- DMA_MEMORY_IO - request that the memory returned from
|
||||
dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc.
|
||||
|
||||
One or both of these flags must be present.
|
||||
|
||||
DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
|
||||
dma_alloc_coherent of any child devices of this one (for memory residing
|
||||
on a bridge).
|
||||
- DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
|
||||
dma_alloc_coherent of any child devices of this one (for memory residing
|
||||
on a bridge).
|
||||
|
||||
DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
|
||||
Do not allow dma_alloc_coherent() to fall back to system memory when
|
||||
it's out of memory in the declared region.
|
||||
- DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
|
||||
Do not allow dma_alloc_coherent() to fall back to system memory when
|
||||
it's out of memory in the declared region.
|
||||
|
||||
The return value will be either DMA_MEMORY_MAP or DMA_MEMORY_IO and
|
||||
must correspond to a passed in flag (i.e. no returning DMA_MEMORY_IO
|
||||
@ -543,15 +617,17 @@ must be accessed using the correct bus functions. If your driver
|
||||
isn't prepared to handle this contingency, it should not specify
|
||||
DMA_MEMORY_IO in the input flags.
|
||||
|
||||
As a simplification for the platforms, only *one* such region of
|
||||
As a simplification for the platforms, only **one** such region of
|
||||
memory may be declared per device.
|
||||
|
||||
For reasons of efficiency, most platforms choose to track the declared
|
||||
region only at the granularity of a page. For smaller allocations,
|
||||
you should use the dma_pool() API.
|
||||
|
||||
void
|
||||
dma_release_declared_memory(struct device *dev)
|
||||
::
|
||||
|
||||
void
|
||||
dma_release_declared_memory(struct device *dev)
|
||||
|
||||
Remove the memory region previously declared from the system. This
|
||||
API performs *no* in-use checking for this region and will return
|
||||
@ -559,9 +635,11 @@ unconditionally having removed all the required structures. It is the
|
||||
driver's job to ensure that no parts of this memory region are
|
||||
currently in use.
|
||||
|
||||
void *
|
||||
dma_mark_declared_memory_occupied(struct device *dev,
|
||||
dma_addr_t device_addr, size_t size)
|
||||
::
|
||||
|
||||
void *
|
||||
dma_mark_declared_memory_occupied(struct device *dev,
|
||||
dma_addr_t device_addr, size_t size)
|
||||
|
||||
This is used to occupy specific regions of the declared space
|
||||
(dma_alloc_coherent() will hand out the first free region it finds).
|
||||
@ -592,38 +670,37 @@ option has a performance impact. Do not enable it in production kernels.
|
||||
If you boot the resulting kernel will contain code which does some bookkeeping
|
||||
about what DMA memory was allocated for which device. If this code detects an
|
||||
error it prints a warning message with some details into your kernel log. An
|
||||
example warning message may look like this:
|
||||
example warning message may look like this::
|
||||
|
||||
------------[ cut here ]------------
|
||||
WARNING: at /data2/repos/linux-2.6-iommu/lib/dma-debug.c:448
|
||||
check_unmap+0x203/0x490()
|
||||
Hardware name:
|
||||
forcedeth 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong
|
||||
function [device address=0x00000000640444be] [size=66 bytes] [mapped as
|
||||
single] [unmapped as page]
|
||||
Modules linked in: nfsd exportfs bridge stp llc r8169
|
||||
Pid: 0, comm: swapper Tainted: G W 2.6.28-dmatest-09289-g8bb99c0 #1
|
||||
Call Trace:
|
||||
<IRQ> [<ffffffff80240b22>] warn_slowpath+0xf2/0x130
|
||||
[<ffffffff80647b70>] _spin_unlock+0x10/0x30
|
||||
[<ffffffff80537e75>] usb_hcd_link_urb_to_ep+0x75/0xc0
|
||||
[<ffffffff80647c22>] _spin_unlock_irqrestore+0x12/0x40
|
||||
[<ffffffff8055347f>] ohci_urb_enqueue+0x19f/0x7c0
|
||||
[<ffffffff80252f96>] queue_work+0x56/0x60
|
||||
[<ffffffff80237e10>] enqueue_task_fair+0x20/0x50
|
||||
[<ffffffff80539279>] usb_hcd_submit_urb+0x379/0xbc0
|
||||
[<ffffffff803b78c3>] cpumask_next_and+0x23/0x40
|
||||
[<ffffffff80235177>] find_busiest_group+0x207/0x8a0
|
||||
[<ffffffff8064784f>] _spin_lock_irqsave+0x1f/0x50
|
||||
[<ffffffff803c7ea3>] check_unmap+0x203/0x490
|
||||
[<ffffffff803c8259>] debug_dma_unmap_page+0x49/0x50
|
||||
[<ffffffff80485f26>] nv_tx_done_optimized+0xc6/0x2c0
|
||||
[<ffffffff80486c13>] nv_nic_irq_optimized+0x73/0x2b0
|
||||
[<ffffffff8026df84>] handle_IRQ_event+0x34/0x70
|
||||
[<ffffffff8026ffe9>] handle_edge_irq+0xc9/0x150
|
||||
[<ffffffff8020e3ab>] do_IRQ+0xcb/0x1c0
|
||||
[<ffffffff8020c093>] ret_from_intr+0x0/0xa
|
||||
<EOI> <4>---[ end trace f6435a98e2a38c0e ]---
|
||||
WARNING: at /data2/repos/linux-2.6-iommu/lib/dma-debug.c:448
|
||||
check_unmap+0x203/0x490()
|
||||
Hardware name:
|
||||
forcedeth 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong
|
||||
function [device address=0x00000000640444be] [size=66 bytes] [mapped as
|
||||
single] [unmapped as page]
|
||||
Modules linked in: nfsd exportfs bridge stp llc r8169
|
||||
Pid: 0, comm: swapper Tainted: G W 2.6.28-dmatest-09289-g8bb99c0 #1
|
||||
Call Trace:
|
||||
<IRQ> [<ffffffff80240b22>] warn_slowpath+0xf2/0x130
|
||||
[<ffffffff80647b70>] _spin_unlock+0x10/0x30
|
||||
[<ffffffff80537e75>] usb_hcd_link_urb_to_ep+0x75/0xc0
|
||||
[<ffffffff80647c22>] _spin_unlock_irqrestore+0x12/0x40
|
||||
[<ffffffff8055347f>] ohci_urb_enqueue+0x19f/0x7c0
|
||||
[<ffffffff80252f96>] queue_work+0x56/0x60
|
||||
[<ffffffff80237e10>] enqueue_task_fair+0x20/0x50
|
||||
[<ffffffff80539279>] usb_hcd_submit_urb+0x379/0xbc0
|
||||
[<ffffffff803b78c3>] cpumask_next_and+0x23/0x40
|
||||
[<ffffffff80235177>] find_busiest_group+0x207/0x8a0
|
||||
[<ffffffff8064784f>] _spin_lock_irqsave+0x1f/0x50
|
||||
[<ffffffff803c7ea3>] check_unmap+0x203/0x490
|
||||
[<ffffffff803c8259>] debug_dma_unmap_page+0x49/0x50
|
||||
[<ffffffff80485f26>] nv_tx_done_optimized+0xc6/0x2c0
|
||||
[<ffffffff80486c13>] nv_nic_irq_optimized+0x73/0x2b0
|
||||
[<ffffffff8026df84>] handle_IRQ_event+0x34/0x70
|
||||
[<ffffffff8026ffe9>] handle_edge_irq+0xc9/0x150
|
||||
[<ffffffff8020e3ab>] do_IRQ+0xcb/0x1c0
|
||||
[<ffffffff8020c093>] ret_from_intr+0x0/0xa
|
||||
<EOI> <4>---[ end trace f6435a98e2a38c0e ]---
|
||||
|
||||
The driver developer can find the driver and the device including a stacktrace
|
||||
of the DMA-API call which caused this warning.
|
||||
@ -637,43 +714,42 @@ details.
|
||||
The debugfs directory for the DMA-API debugging code is called dma-api/. In
|
||||
this directory the following files can currently be found:
|
||||
|
||||
dma-api/all_errors This file contains a numeric value. If this
|
||||
=============================== ===============================================
|
||||
dma-api/all_errors This file contains a numeric value. If this
|
||||
value is not equal to zero the debugging code
|
||||
will print a warning for every error it finds
|
||||
into the kernel log. Be careful with this
|
||||
option, as it can easily flood your logs.
|
||||
|
||||
dma-api/disabled This read-only file contains the character 'Y'
|
||||
dma-api/disabled This read-only file contains the character 'Y'
|
||||
if the debugging code is disabled. This can
|
||||
happen when it runs out of memory or if it was
|
||||
disabled at boot time
|
||||
|
||||
dma-api/error_count This file is read-only and shows the total
|
||||
dma-api/error_count This file is read-only and shows the total
|
||||
numbers of errors found.
|
||||
|
||||
dma-api/num_errors The number in this file shows how many
|
||||
dma-api/num_errors The number in this file shows how many
|
||||
warnings will be printed to the kernel log
|
||||
before it stops. This number is initialized to
|
||||
one at system boot and be set by writing into
|
||||
this file
|
||||
|
||||
dma-api/min_free_entries
|
||||
This read-only file can be read to get the
|
||||
dma-api/min_free_entries This read-only file can be read to get the
|
||||
minimum number of free dma_debug_entries the
|
||||
allocator has ever seen. If this value goes
|
||||
down to zero the code will disable itself
|
||||
because it is not longer reliable.
|
||||
|
||||
dma-api/num_free_entries
|
||||
The current number of free dma_debug_entries
|
||||
dma-api/num_free_entries The current number of free dma_debug_entries
|
||||
in the allocator.
|
||||
|
||||
dma-api/driver-filter
|
||||
You can write a name of a driver into this file
|
||||
dma-api/driver-filter You can write a name of a driver into this file
|
||||
to limit the debug output to requests from that
|
||||
particular driver. Write an empty string to
|
||||
that file to disable the filter and see
|
||||
all errors again.
|
||||
=============================== ===============================================
|
||||
|
||||
If you have this code compiled into your kernel it will be enabled by default.
|
||||
If you want to boot without the bookkeeping anyway you can provide
|
||||
@ -692,7 +768,10 @@ of preallocated entries is defined per architecture. If it is too low for you
|
||||
boot with 'dma_debug_entries=<your_desired_number>' to overwrite the
|
||||
architectural default.
|
||||
|
||||
void debug_dmap_mapping_error(struct device *dev, dma_addr_t dma_addr);
|
||||
::
|
||||
|
||||
void
|
||||
debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
|
||||
|
||||
dma-debug interface debug_dma_mapping_error() to debug drivers that fail
|
||||
to check DMA mapping errors on addresses returned by dma_map_single() and
|
||||
@ -702,4 +781,3 @@ the driver. When driver does unmap, debug_dma_unmap() checks the flag and if
|
||||
this flag is still set, prints warning message that includes call trace that
|
||||
leads up to the unmap. This interface can be called from dma_mapping_error()
|
||||
routines to enable DMA mapping error check debugging.
|
||||
|
||||
|
@ -1,19 +1,20 @@
|
||||
DMA with ISA and LPC devices
|
||||
============================
|
||||
============================
|
||||
DMA with ISA and LPC devices
|
||||
============================
|
||||
|
||||
Pierre Ossman <drzeus@drzeus.cx>
|
||||
:Author: Pierre Ossman <drzeus@drzeus.cx>
|
||||
|
||||
This document describes how to do DMA transfers using the old ISA DMA
|
||||
controller. Even though ISA is more or less dead today the LPC bus
|
||||
uses the same DMA system so it will be around for quite some time.
|
||||
|
||||
Part I - Headers and dependencies
|
||||
---------------------------------
|
||||
Headers and dependencies
|
||||
------------------------
|
||||
|
||||
To do ISA style DMA you need to include two headers:
|
||||
To do ISA style DMA you need to include two headers::
|
||||
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <asm/dma.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <asm/dma.h>
|
||||
|
||||
The first is the generic DMA API used to convert virtual addresses to
|
||||
bus addresses (see Documentation/DMA-API.txt for details).
|
||||
@ -23,8 +24,8 @@ this is not present on all platforms make sure you construct your
|
||||
Kconfig to be dependent on ISA_DMA_API (not ISA) so that nobody tries
|
||||
to build your driver on unsupported platforms.
|
||||
|
||||
Part II - Buffer allocation
|
||||
---------------------------
|
||||
Buffer allocation
|
||||
-----------------
|
||||
|
||||
The ISA DMA controller has some very strict requirements on which
|
||||
memory it can access so extra care must be taken when allocating
|
||||
@ -42,13 +43,13 @@ requirements you pass the flag GFP_DMA to kmalloc.
|
||||
|
||||
Unfortunately the memory available for ISA DMA is scarce so unless you
|
||||
allocate the memory during boot-up it's a good idea to also pass
|
||||
__GFP_REPEAT and __GFP_NOWARN to make the allocator try a bit harder.
|
||||
__GFP_RETRY_MAYFAIL and __GFP_NOWARN to make the allocator try a bit harder.
|
||||
|
||||
(This scarcity also means that you should allocate the buffer as
|
||||
early as possible and not release it until the driver is unloaded.)
|
||||
|
||||
Part III - Address translation
|
||||
------------------------------
|
||||
Address translation
|
||||
-------------------
|
||||
|
||||
To translate the virtual address to a bus address, use the normal DMA
|
||||
API. Do _not_ use isa_virt_to_phys() even though it does the same
|
||||
@ -61,8 +62,8 @@ Note: x86_64 had a broken DMA API when it came to ISA but has since
|
||||
been fixed. If your arch has problems then fix the DMA API instead of
|
||||
reverting to the ISA functions.
|
||||
|
||||
Part IV - Channels
|
||||
------------------
|
||||
Channels
|
||||
--------
|
||||
|
||||
A normal ISA DMA controller has 8 channels. The lower four are for
|
||||
8-bit transfers and the upper four are for 16-bit transfers.
|
||||
@ -80,8 +81,8 @@ The ability to use 16-bit or 8-bit transfers is _not_ up to you as a
|
||||
driver author but depends on what the hardware supports. Check your
|
||||
specs or test different channels.
|
||||
|
||||
Part V - Transfer data
|
||||
----------------------
|
||||
Transfer data
|
||||
-------------
|
||||
|
||||
Now for the good stuff, the actual DMA transfer. :)
|
||||
|
||||
@ -112,37 +113,37 @@ Once the DMA transfer is finished (or timed out) you should disable
|
||||
the channel again. You should also check get_dma_residue() to make
|
||||
sure that all data has been transferred.
|
||||
|
||||
Example:
|
||||
Example::
|
||||
|
||||
int flags, residue;
|
||||
int flags, residue;
|
||||
|
||||
flags = claim_dma_lock();
|
||||
flags = claim_dma_lock();
|
||||
|
||||
clear_dma_ff();
|
||||
clear_dma_ff();
|
||||
|
||||
set_dma_mode(channel, DMA_MODE_WRITE);
|
||||
set_dma_addr(channel, phys_addr);
|
||||
set_dma_count(channel, num_bytes);
|
||||
set_dma_mode(channel, DMA_MODE_WRITE);
|
||||
set_dma_addr(channel, phys_addr);
|
||||
set_dma_count(channel, num_bytes);
|
||||
|
||||
dma_enable(channel);
|
||||
dma_enable(channel);
|
||||
|
||||
release_dma_lock(flags);
|
||||
release_dma_lock(flags);
|
||||
|
||||
while (!device_done());
|
||||
while (!device_done());
|
||||
|
||||
flags = claim_dma_lock();
|
||||
flags = claim_dma_lock();
|
||||
|
||||
dma_disable(channel);
|
||||
dma_disable(channel);
|
||||
|
||||
residue = dma_get_residue(channel);
|
||||
if (residue != 0)
|
||||
printk(KERN_ERR "driver: Incomplete DMA transfer!"
|
||||
" %d bytes left!\n", residue);
|
||||
residue = dma_get_residue(channel);
|
||||
if (residue != 0)
|
||||
printk(KERN_ERR "driver: Incomplete DMA transfer!"
|
||||
" %d bytes left!\n", residue);
|
||||
|
||||
release_dma_lock(flags);
|
||||
release_dma_lock(flags);
|
||||
|
||||
Part VI - Suspend/resume
|
||||
------------------------
|
||||
Suspend/resume
|
||||
--------------
|
||||
|
||||
It is the driver's responsibility to make sure that the machine isn't
|
||||
suspended while a DMA transfer is in progress. Also, all DMA settings
|
||||
|
@ -1,5 +1,6 @@
|
||||
DMA attributes
|
||||
==============
|
||||
==============
|
||||
DMA attributes
|
||||
==============
|
||||
|
||||
This document describes the semantics of the DMA attributes that are
|
||||
defined in linux/dma-mapping.h.
|
||||
@ -108,6 +109,7 @@ This is a hint to the DMA-mapping subsystem that it's probably not worth
|
||||
the time to try to allocate memory to in a way that gives better TLB
|
||||
efficiency (AKA it's not worth trying to build the mapping out of larger
|
||||
pages). You might want to specify this if:
|
||||
|
||||
- You know that the accesses to this memory won't thrash the TLB.
|
||||
You might know that the accesses are likely to be sequential or
|
||||
that they aren't sequential but it's unlikely you'll ping-pong
|
||||
@ -121,11 +123,12 @@ pages). You might want to specify this if:
|
||||
the mapping to have a short lifetime then it may be worth it to
|
||||
optimize allocation (avoid coming up with large pages) instead of
|
||||
getting the slight performance win of larger pages.
|
||||
|
||||
Setting this hint doesn't guarantee that you won't get huge pages, but it
|
||||
means that we won't try quite as hard to get them.
|
||||
|
||||
NOTE: At the moment DMA_ATTR_ALLOC_SINGLE_PAGES is only implemented on ARM,
|
||||
though ARM64 patches will likely be posted soon.
|
||||
.. note:: At the moment DMA_ATTR_ALLOC_SINGLE_PAGES is only implemented on ARM,
|
||||
though ARM64 patches will likely be posted soon.
|
||||
|
||||
DMA_ATTR_NO_WARN
|
||||
----------------
|
||||
@ -142,10 +145,10 @@ problem at all, depending on the implementation of the retry mechanism.
|
||||
So, this provides a way for drivers to avoid those error messages on calls
|
||||
where allocation failures are not a problem, and shouldn't bother the logs.
|
||||
|
||||
NOTE: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC.
|
||||
.. note:: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC.
|
||||
|
||||
DMA_ATTR_PRIVILEGED
|
||||
------------------------------
|
||||
-------------------
|
||||
|
||||
Some advanced peripherals such as remote processors and GPUs perform
|
||||
accesses to DMA buffers in both privileged "supervisor" and unprivileged
|
||||
|
17
Documentation/DocBook/.gitignore
vendored
17
Documentation/DocBook/.gitignore
vendored
@ -1,17 +0,0 @@
|
||||
*.xml
|
||||
*.ps
|
||||
*.pdf
|
||||
*.html
|
||||
*.9.gz
|
||||
*.9
|
||||
*.aux
|
||||
*.dvi
|
||||
*.log
|
||||
*.out
|
||||
*.png
|
||||
*.gif
|
||||
*.svg
|
||||
*.proc
|
||||
*.db
|
||||
media-indices.tmpl
|
||||
media-entities.tmpl
|
@ -1,282 +0,0 @@
|
||||
###
|
||||
# This makefile is used to generate the kernel documentation,
|
||||
# primarily based on in-line comments in various source files.
|
||||
# See Documentation/kernel-doc-nano-HOWTO.txt for instruction in how
|
||||
# to document the SRC - and how to read it.
|
||||
# To add a new book the only step required is to add the book to the
|
||||
# list of DOCBOOKS.
|
||||
|
||||
DOCBOOKS := z8530book.xml \
|
||||
kernel-hacking.xml kernel-locking.xml \
|
||||
networking.xml \
|
||||
filesystems.xml lsm.xml kgdb.xml \
|
||||
libata.xml mtdnand.xml librs.xml rapidio.xml \
|
||||
s390-drivers.xml scsi.xml \
|
||||
sh.xml w1.xml
|
||||
|
||||
ifeq ($(DOCBOOKS),)
|
||||
|
||||
# Skip DocBook build if the user explicitly requested no DOCBOOKS.
|
||||
.DEFAULT:
|
||||
@echo " SKIP DocBook $@ target (DOCBOOKS=\"\" specified)."
|
||||
else
|
||||
ifneq ($(SPHINXDIRS),)
|
||||
|
||||
# Skip DocBook build if the user explicitly requested a sphinx dir
|
||||
.DEFAULT:
|
||||
@echo " SKIP DocBook $@ target (SPHINXDIRS specified)."
|
||||
else
|
||||
|
||||
|
||||
###
|
||||
# The build process is as follows (targets):
|
||||
# (xmldocs) [by docproc]
|
||||
# file.tmpl --> file.xml +--> file.ps (psdocs) [by db2ps or xmlto]
|
||||
# +--> file.pdf (pdfdocs) [by db2pdf or xmlto]
|
||||
# +--> DIR=file (htmldocs) [by xmlto]
|
||||
# +--> man/ (mandocs) [by xmlto]
|
||||
|
||||
|
||||
# for PDF and PS output you can choose between xmlto and docbook-utils tools
|
||||
PDF_METHOD = $(prefer-db2x)
|
||||
PS_METHOD = $(prefer-db2x)
|
||||
|
||||
|
||||
targets += $(DOCBOOKS)
|
||||
BOOKS := $(addprefix $(obj)/,$(DOCBOOKS))
|
||||
xmldocs: $(BOOKS)
|
||||
sgmldocs: xmldocs
|
||||
|
||||
PS := $(patsubst %.xml, %.ps, $(BOOKS))
|
||||
psdocs: $(PS)
|
||||
|
||||
PDF := $(patsubst %.xml, %.pdf, $(BOOKS))
|
||||
pdfdocs: $(PDF)
|
||||
|
||||
HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS)))
|
||||
htmldocs: $(HTML)
|
||||
$(call cmd,build_main_index)
|
||||
|
||||
MAN := $(patsubst %.xml, %.9, $(BOOKS))
|
||||
mandocs: $(MAN)
|
||||
find $(obj)/man -name '*.9' | xargs gzip -nf
|
||||
|
||||
# Default location for installed man pages
|
||||
export INSTALL_MAN_PATH = $(objtree)/usr
|
||||
|
||||
installmandocs: mandocs
|
||||
mkdir -p $(INSTALL_MAN_PATH)/man/man9/
|
||||
find $(obj)/man -name '*.9.gz' -printf '%h %f\n' | \
|
||||
sort -k 2 -k 1 | uniq -f 1 | sed -e 's: :/:' | \
|
||||
xargs install -m 644 -t $(INSTALL_MAN_PATH)/man/man9/
|
||||
|
||||
# no-op for the DocBook toolchain
|
||||
epubdocs:
|
||||
latexdocs:
|
||||
linkcheckdocs:
|
||||
|
||||
###
|
||||
#External programs used
|
||||
KERNELDOCXMLREF = $(srctree)/scripts/kernel-doc-xml-ref
|
||||
KERNELDOC = $(srctree)/scripts/kernel-doc
|
||||
DOCPROC = $(objtree)/scripts/docproc
|
||||
CHECK_LC_CTYPE = $(objtree)/scripts/check-lc_ctype
|
||||
|
||||
# Use a fixed encoding - UTF-8 if the C library has support built-in
|
||||
# or ASCII if not
|
||||
LC_CTYPE := $(call try-run, LC_CTYPE=C.UTF-8 $(CHECK_LC_CTYPE),C.UTF-8,C)
|
||||
export LC_CTYPE
|
||||
|
||||
XMLTOFLAGS = -m $(srctree)/$(src)/stylesheet.xsl
|
||||
XMLTOFLAGS += --skip-validation
|
||||
|
||||
###
|
||||
# DOCPROC is used for two purposes:
|
||||
# 1) To generate a dependency list for a .tmpl file
|
||||
# 2) To preprocess a .tmpl file and call kernel-doc with
|
||||
# appropriate parameters.
|
||||
# The following rules are used to generate the .xml documentation
|
||||
# required to generate the final targets. (ps, pdf, html).
|
||||
quiet_cmd_docproc = DOCPROC $@
|
||||
cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@
|
||||
define rule_docproc
|
||||
set -e; \
|
||||
$(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))';) \
|
||||
$(cmd_$(1)); \
|
||||
( \
|
||||
echo 'cmd_$@ := $(cmd_$(1))'; \
|
||||
echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; \
|
||||
) > $(dir $@).$(notdir $@).cmd
|
||||
endef
|
||||
|
||||
%.xml: %.tmpl $(KERNELDOC) $(DOCPROC) $(KERNELDOCXMLREF) FORCE
|
||||
$(call if_changed_rule,docproc)
|
||||
|
||||
# Tell kbuild to always build the programs
|
||||
always := $(hostprogs-y)
|
||||
|
||||
notfoundtemplate = echo "*** You have to install docbook-utils or xmlto ***"; \
|
||||
exit 1
|
||||
db2xtemplate = db2TYPE -o $(dir $@) $<
|
||||
xmltotemplate = xmlto TYPE $(XMLTOFLAGS) -o $(dir $@) $<
|
||||
|
||||
# determine which methods are available
|
||||
ifeq ($(shell which db2ps >/dev/null 2>&1 && echo found),found)
|
||||
use-db2x = db2x
|
||||
prefer-db2x = db2x
|
||||
else
|
||||
use-db2x = notfound
|
||||
prefer-db2x = $(use-xmlto)
|
||||
endif
|
||||
ifeq ($(shell which xmlto >/dev/null 2>&1 && echo found),found)
|
||||
use-xmlto = xmlto
|
||||
prefer-xmlto = xmlto
|
||||
else
|
||||
use-xmlto = notfound
|
||||
prefer-xmlto = $(use-db2x)
|
||||
endif
|
||||
|
||||
# the commands, generated from the chosen template
|
||||
quiet_cmd_db2ps = PS $@
|
||||
cmd_db2ps = $(subst TYPE,ps, $($(PS_METHOD)template))
|
||||
%.ps : %.xml
|
||||
$(call cmd,db2ps)
|
||||
|
||||
quiet_cmd_db2pdf = PDF $@
|
||||
cmd_db2pdf = $(subst TYPE,pdf, $($(PDF_METHOD)template))
|
||||
%.pdf : %.xml
|
||||
$(call cmd,db2pdf)
|
||||
|
||||
|
||||
index = index.html
|
||||
main_idx = $(obj)/$(index)
|
||||
quiet_cmd_build_main_index = HTML $(main_idx)
|
||||
cmd_build_main_index = rm -rf $(main_idx); \
|
||||
echo '<h1>Linux Kernel HTML Documentation</h1>' >> $(main_idx) && \
|
||||
echo '<h2>Kernel Version: $(KERNELVERSION)</h2>' >> $(main_idx) && \
|
||||
cat $(HTML) >> $(main_idx)
|
||||
|
||||
quiet_cmd_db2html = HTML $@
|
||||
cmd_db2html = xmlto html $(XMLTOFLAGS) -o $(patsubst %.html,%,$@) $< && \
|
||||
echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/index.html"> \
|
||||
$(patsubst %.html,%,$(notdir $@))</a><p>' > $@
|
||||
|
||||
###
|
||||
# Rules to create an aux XML and .db, and use them to re-process the DocBook XML
|
||||
# to fill internal hyperlinks
|
||||
gen_aux_xml = :
|
||||
quiet_gen_aux_xml = echo ' XMLREF $@'
|
||||
silent_gen_aux_xml = :
|
||||
%.aux.xml: %.xml
|
||||
@$($(quiet)gen_aux_xml)
|
||||
@rm -rf $@
|
||||
@(cat $< | egrep "^<refentry id" | egrep -o "\".*\"" | cut -f 2 -d \" > $<.db)
|
||||
@$(KERNELDOCXMLREF) -db $<.db $< > $@
|
||||
.PRECIOUS: %.aux.xml
|
||||
|
||||
%.html: %.aux.xml
|
||||
@(which xmlto > /dev/null 2>&1) || \
|
||||
(echo "*** You need to install xmlto ***"; \
|
||||
exit 1)
|
||||
@rm -rf $@ $(patsubst %.html,%,$@)
|
||||
$(call cmd,db2html)
|
||||
@if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \
|
||||
cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi
|
||||
|
||||
quiet_cmd_db2man = MAN $@
|
||||
cmd_db2man = if grep -q refentry $<; then xmlto man $(XMLTOFLAGS) -o $(obj)/man/$(*F) $< ; fi
|
||||
%.9 : %.xml
|
||||
@(which xmlto > /dev/null 2>&1) || \
|
||||
(echo "*** You need to install xmlto ***"; \
|
||||
exit 1)
|
||||
$(Q)mkdir -p $(obj)/man/$(*F)
|
||||
$(call cmd,db2man)
|
||||
@touch $@
|
||||
|
||||
###
|
||||
# Rules to generate postscripts and PNG images from .fig format files
|
||||
quiet_cmd_fig2eps = FIG2EPS $@
|
||||
cmd_fig2eps = fig2dev -Leps $< $@
|
||||
|
||||
%.eps: %.fig
|
||||
@(which fig2dev > /dev/null 2>&1) || \
|
||||
(echo "*** You need to install transfig ***"; \
|
||||
exit 1)
|
||||
$(call cmd,fig2eps)
|
||||
|
||||
quiet_cmd_fig2png = FIG2PNG $@
|
||||
cmd_fig2png = fig2dev -Lpng $< $@
|
||||
|
||||
%.png: %.fig
|
||||
@(which fig2dev > /dev/null 2>&1) || \
|
||||
(echo "*** You need to install transfig ***"; \
|
||||
exit 1)
|
||||
$(call cmd,fig2png)
|
||||
|
||||
###
|
||||
# Rule to convert a .c file to inline XML documentation
|
||||
gen_xml = :
|
||||
quiet_gen_xml = echo ' GEN $@'
|
||||
silent_gen_xml = :
|
||||
%.xml: %.c
|
||||
@$($(quiet)gen_xml)
|
||||
@( \
|
||||
echo "<programlisting>"; \
|
||||
expand --tabs=8 < $< | \
|
||||
sed -e "s/&/\\&/g" \
|
||||
-e "s/</\\</g" \
|
||||
-e "s/>/\\>/g"; \
|
||||
echo "</programlisting>") > $@
|
||||
|
||||
endif # DOCBOOKS=""
|
||||
endif # SPHINDIR=...
|
||||
|
||||
###
|
||||
# Help targets as used by the top-level makefile
|
||||
dochelp:
|
||||
@echo ' Linux kernel internal documentation in different formats (DocBook):'
|
||||
@echo ' htmldocs - HTML'
|
||||
@echo ' pdfdocs - PDF'
|
||||
@echo ' psdocs - Postscript'
|
||||
@echo ' xmldocs - XML DocBook'
|
||||
@echo ' mandocs - man pages'
|
||||
@echo ' installmandocs - install man pages generated by mandocs to INSTALL_MAN_PATH'; \
|
||||
echo ' (default: $(INSTALL_MAN_PATH))'; \
|
||||
echo ''
|
||||
@echo ' cleandocs - clean all generated DocBook files'
|
||||
@echo
|
||||
@echo ' make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml'
|
||||
@echo ' valid values for DOCBOOKS are: $(DOCBOOKS)'
|
||||
@echo
|
||||
@echo " make DOCBOOKS=\"\" [target] Don't generate docs from Docbook"
|
||||
@echo ' This is useful to generate only the ReST docs (Sphinx)'
|
||||
|
||||
|
||||
###
|
||||
# Temporary files left by various tools
|
||||
clean-files := $(DOCBOOKS) \
|
||||
$(patsubst %.xml, %.dvi, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.aux, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.tex, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.log, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.out, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.ps, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.pdf, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.html, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.9, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.aux.xml, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.xml.db, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, %.xml, $(DOCBOOKS)) \
|
||||
$(patsubst %.xml, .%.xml.cmd, $(DOCBOOKS)) \
|
||||
$(index)
|
||||
|
||||
clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man
|
||||
|
||||
cleandocs:
|
||||
$(Q)rm -f $(call objectify, $(clean-files))
|
||||
$(Q)rm -rf $(call objectify, $(clean-dirs))
|
||||
|
||||
# Declare the contents of the .PHONY variable as phony. We keep that
|
||||
# information in a variable so we can use it in if_changed and friends.
|
||||
|
||||
.PHONY: $(PHONY)
|
@ -1,381 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<book id="Linux-filesystems-API">
|
||||
<bookinfo>
|
||||
<title>Linux Filesystems API</title>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This documentation is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2 of the License, or (at your option) any later
|
||||
version.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This program is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU General Public License for more details.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
You should have received a copy of the GNU General Public
|
||||
License along with this program; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
||||
MA 02111-1307 USA
|
||||
</para>
|
||||
|
||||
<para>
|
||||
For more details see the file COPYING in the source
|
||||
distribution of Linux.
|
||||
</para>
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter id="vfs">
|
||||
<title>The Linux VFS</title>
|
||||
<sect1 id="the_filesystem_types"><title>The Filesystem types</title>
|
||||
!Iinclude/linux/fs.h
|
||||
</sect1>
|
||||
<sect1 id="the_directory_cache"><title>The Directory Cache</title>
|
||||
!Efs/dcache.c
|
||||
!Iinclude/linux/dcache.h
|
||||
</sect1>
|
||||
<sect1 id="inode_handling"><title>Inode Handling</title>
|
||||
!Efs/inode.c
|
||||
!Efs/bad_inode.c
|
||||
</sect1>
|
||||
<sect1 id="registration_and_superblocks"><title>Registration and Superblocks</title>
|
||||
!Efs/super.c
|
||||
</sect1>
|
||||
<sect1 id="file_locks"><title>File Locks</title>
|
||||
!Efs/locks.c
|
||||
!Ifs/locks.c
|
||||
</sect1>
|
||||
<sect1 id="other_functions"><title>Other Functions</title>
|
||||
!Efs/mpage.c
|
||||
!Efs/namei.c
|
||||
!Efs/buffer.c
|
||||
!Eblock/bio.c
|
||||
!Efs/seq_file.c
|
||||
!Efs/filesystems.c
|
||||
!Efs/fs-writeback.c
|
||||
!Efs/block_dev.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="proc">
|
||||
<title>The proc filesystem</title>
|
||||
|
||||
<sect1 id="sysctl_interface"><title>sysctl interface</title>
|
||||
!Ekernel/sysctl.c
|
||||
</sect1>
|
||||
|
||||
<sect1 id="proc_filesystem_interface"><title>proc filesystem interface</title>
|
||||
!Ifs/proc/base.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="fs_events">
|
||||
<title>Events based on file descriptors</title>
|
||||
!Efs/eventfd.c
|
||||
</chapter>
|
||||
|
||||
<chapter id="sysfs">
|
||||
<title>The Filesystem for Exporting Kernel Objects</title>
|
||||
!Efs/sysfs/file.c
|
||||
!Efs/sysfs/symlink.c
|
||||
</chapter>
|
||||
|
||||
<chapter id="debugfs">
|
||||
<title>The debugfs filesystem</title>
|
||||
|
||||
<sect1 id="debugfs_interface"><title>debugfs interface</title>
|
||||
!Efs/debugfs/inode.c
|
||||
!Efs/debugfs/file.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="LinuxJDBAPI">
|
||||
<chapterinfo>
|
||||
<title>The Linux Journalling API</title>
|
||||
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>Roger</firstname>
|
||||
<surname>Gammans</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>rgammans@computer-surgery.co.uk</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>Stephen</firstname>
|
||||
<surname>Tweedie</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>sct@redhat.com</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
|
||||
<copyright>
|
||||
<year>2002</year>
|
||||
<holder>Roger Gammans</holder>
|
||||
</copyright>
|
||||
</chapterinfo>
|
||||
|
||||
<title>The Linux Journalling API</title>
|
||||
|
||||
<sect1 id="journaling_overview">
|
||||
<title>Overview</title>
|
||||
<sect2 id="journaling_details">
|
||||
<title>Details</title>
|
||||
<para>
|
||||
The journalling layer is easy to use. You need to
|
||||
first of all create a journal_t data structure. There are
|
||||
two calls to do this dependent on how you decide to allocate the physical
|
||||
media on which the journal resides. The jbd2_journal_init_inode() call
|
||||
is for journals stored in filesystem inodes, or the jbd2_journal_init_dev()
|
||||
call can be used for journal stored on a raw device (in a continuous range
|
||||
of blocks). A journal_t is a typedef for a struct pointer, so when
|
||||
you are finally finished make sure you call jbd2_journal_destroy() on it
|
||||
to free up any used kernel memory.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Once you have got your journal_t object you need to 'mount' or load the journal
|
||||
file. The journalling layer expects the space for the journal was already
|
||||
allocated and initialized properly by the userspace tools. When loading the
|
||||
journal you must call jbd2_journal_load() to process journal contents. If the
|
||||
client file system detects the journal contents does not need to be processed
|
||||
(or even need not have valid contents), it may call jbd2_journal_wipe() to
|
||||
clear the journal contents before calling jbd2_journal_load().
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Note that jbd2_journal_wipe(..,0) calls jbd2_journal_skip_recovery() for you if
|
||||
it detects any outstanding transactions in the journal and similarly
|
||||
jbd2_journal_load() will call jbd2_journal_recover() if necessary. I would
|
||||
advise reading ext4_load_journal() in fs/ext4/super.c for examples on this
|
||||
stage.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Now you can go ahead and start modifying the underlying
|
||||
filesystem. Almost.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
|
||||
You still need to actually journal your filesystem changes, this
|
||||
is done by wrapping them into transactions. Additionally you
|
||||
also need to wrap the modification of each of the buffers
|
||||
with calls to the journal layer, so it knows what the modifications
|
||||
you are actually making are. To do this use jbd2_journal_start() which
|
||||
returns a transaction handle.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
jbd2_journal_start()
|
||||
and its counterpart jbd2_journal_stop(), which indicates the end of a
|
||||
transaction are nestable calls, so you can reenter a transaction if necessary,
|
||||
but remember you must call jbd2_journal_stop() the same number of times as
|
||||
jbd2_journal_start() before the transaction is completed (or more accurately
|
||||
leaves the update phase). Ext4/VFS makes use of this feature to simplify
|
||||
handling of inode dirtying, quota support, etc.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Inside each transaction you need to wrap the modifications to the
|
||||
individual buffers (blocks). Before you start to modify a buffer you
|
||||
need to call jbd2_journal_get_{create,write,undo}_access() as appropriate,
|
||||
this allows the journalling layer to copy the unmodified data if it
|
||||
needs to. After all the buffer may be part of a previously uncommitted
|
||||
transaction.
|
||||
At this point you are at last ready to modify a buffer, and once
|
||||
you are have done so you need to call jbd2_journal_dirty_{meta,}data().
|
||||
Or if you've asked for access to a buffer you now know is now longer
|
||||
required to be pushed back on the device you can call jbd2_journal_forget()
|
||||
in much the same way as you might have used bforget() in the past.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
A jbd2_journal_flush() may be called at any time to commit and checkpoint
|
||||
all your transactions.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Then at umount time , in your put_super() you can then call jbd2_journal_destroy()
|
||||
to clean up your in-core journal object.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Unfortunately there a couple of ways the journal layer can cause a deadlock.
|
||||
The first thing to note is that each task can only have
|
||||
a single outstanding transaction at any one time, remember nothing
|
||||
commits until the outermost jbd2_journal_stop(). This means
|
||||
you must complete the transaction at the end of each file/inode/address
|
||||
etc. operation you perform, so that the journalling system isn't re-entered
|
||||
on another journal. Since transactions can't be nested/batched
|
||||
across differing journals, and another filesystem other than
|
||||
yours (say ext4) may be modified in a later syscall.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The second case to bear in mind is that jbd2_journal_start() can
|
||||
block if there isn't enough space in the journal for your transaction
|
||||
(based on the passed nblocks param) - when it blocks it merely(!) needs to
|
||||
wait for transactions to complete and be committed from other tasks,
|
||||
so essentially we are waiting for jbd2_journal_stop(). So to avoid
|
||||
deadlocks you must treat jbd2_journal_start/stop() as if they
|
||||
were semaphores and include them in your semaphore ordering rules to prevent
|
||||
deadlocks. Note that jbd2_journal_extend() has similar blocking behaviour to
|
||||
jbd2_journal_start() so you can deadlock here just as easily as on
|
||||
jbd2_journal_start().
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Try to reserve the right number of blocks the first time. ;-). This will
|
||||
be the maximum number of blocks you are going to touch in this transaction.
|
||||
I advise having a look at at least ext4_jbd.h to see the basis on which
|
||||
ext4 uses to make these decisions.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Another wriggle to watch out for is your on-disk block allocation strategy.
|
||||
Why? Because, if you do a delete, you need to ensure you haven't reused any
|
||||
of the freed blocks until the transaction freeing these blocks commits. If you
|
||||
reused these blocks and crash happens, there is no way to restore the contents
|
||||
of the reallocated blocks at the end of the last fully committed transaction.
|
||||
|
||||
One simple way of doing this is to mark blocks as free in internal in-memory
|
||||
block allocation structures only after the transaction freeing them commits.
|
||||
Ext4 uses journal commit callback for this purpose.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
With journal commit callbacks you can ask the journalling layer to call a
|
||||
callback function when the transaction is finally committed to disk, so that
|
||||
you can do some of your own management. You ask the journalling layer for
|
||||
calling the callback by simply setting journal->j_commit_callback function
|
||||
pointer and that function is called after each transaction commit. You can also
|
||||
use transaction->t_private_list for attaching entries to a transaction that
|
||||
need processing when the transaction commits.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
JBD2 also provides a way to block all transaction updates via
|
||||
jbd2_journal_{un,}lock_updates(). Ext4 uses this when it wants a window with a
|
||||
clean and stable fs for a moment. E.g.
|
||||
</para>
|
||||
|
||||
<programlisting>
|
||||
|
||||
jbd2_journal_lock_updates() //stop new stuff happening..
|
||||
jbd2_journal_flush() // checkpoint everything.
|
||||
..do stuff on stable fs
|
||||
jbd2_journal_unlock_updates() // carry on with filesystem use.
|
||||
</programlisting>
|
||||
|
||||
<para>
|
||||
The opportunities for abuse and DOS attacks with this should be obvious,
|
||||
if you allow unprivileged userspace to trigger codepaths containing these
|
||||
calls.
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="jbd_summary">
|
||||
<title>Summary</title>
|
||||
<para>
|
||||
Using the journal is a matter of wrapping the different context changes,
|
||||
being each mount, each modification (transaction) and each changed buffer
|
||||
to tell the journalling layer about them.
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="data_types">
|
||||
<title>Data Types</title>
|
||||
<para>
|
||||
The journalling layer uses typedefs to 'hide' the concrete definitions
|
||||
of the structures used. As a client of the JBD2 layer you can
|
||||
just rely on the using the pointer as a magic cookie of some sort.
|
||||
|
||||
Obviously the hiding is not enforced as this is 'C'.
|
||||
</para>
|
||||
<sect2 id="structures"><title>Structures</title>
|
||||
!Iinclude/linux/jbd2.h
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="functions">
|
||||
<title>Functions</title>
|
||||
<para>
|
||||
The functions here are split into two groups those that
|
||||
affect a journal as a whole, and those which are used to
|
||||
manage transactions
|
||||
</para>
|
||||
<sect2 id="journal_level"><title>Journal Level</title>
|
||||
!Efs/jbd2/journal.c
|
||||
!Ifs/jbd2/recovery.c
|
||||
</sect2>
|
||||
<sect2 id="transaction_level"><title>Transasction Level</title>
|
||||
!Efs/jbd2/transaction.c
|
||||
</sect2>
|
||||
</sect1>
|
||||
<sect1 id="see_also">
|
||||
<title>See also</title>
|
||||
<para>
|
||||
<citation>
|
||||
<ulink url="http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz">
|
||||
Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie
|
||||
</ulink>
|
||||
</citation>
|
||||
</para>
|
||||
<para>
|
||||
<citation>
|
||||
<ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html">
|
||||
Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie
|
||||
</ulink>
|
||||
</citation>
|
||||
</para>
|
||||
</sect1>
|
||||
|
||||
</chapter>
|
||||
|
||||
<chapter id="splice">
|
||||
<title>splice API</title>
|
||||
<para>
|
||||
splice is a method for moving blocks of data around inside the
|
||||
kernel, without continually transferring them between the kernel
|
||||
and user space.
|
||||
</para>
|
||||
!Ffs/splice.c
|
||||
</chapter>
|
||||
|
||||
<chapter id="pipes">
|
||||
<title>pipes API</title>
|
||||
<para>
|
||||
Pipe interfaces are all for in-kernel (builtin image) use.
|
||||
They are not exported for use by modules.
|
||||
</para>
|
||||
!Iinclude/linux/pipe_fs_i.h
|
||||
!Ffs/pipe.c
|
||||
</chapter>
|
||||
|
||||
</book>
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,918 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<book id="kgdbOnLinux">
|
||||
<bookinfo>
|
||||
<title>Using kgdb, kdb and the kernel debugger internals</title>
|
||||
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>Jason</firstname>
|
||||
<surname>Wessel</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>jason.wessel@windriver.com</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
<copyright>
|
||||
<year>2008,2010</year>
|
||||
<holder>Wind River Systems, Inc.</holder>
|
||||
</copyright>
|
||||
<copyright>
|
||||
<year>2004-2005</year>
|
||||
<holder>MontaVista Software, Inc.</holder>
|
||||
</copyright>
|
||||
<copyright>
|
||||
<year>2004</year>
|
||||
<holder>Amit S. Kale</holder>
|
||||
</copyright>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This file is licensed under the terms of the GNU General Public License
|
||||
version 2. This program is licensed "as is" without any warranty of any
|
||||
kind, whether express or implied.
|
||||
</para>
|
||||
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
<chapter id="Introduction">
|
||||
<title>Introduction</title>
|
||||
<para>
|
||||
The kernel has two different debugger front ends (kdb and kgdb)
|
||||
which interface to the debug core. It is possible to use either
|
||||
of the debugger front ends and dynamically transition between them
|
||||
if you configure the kernel properly at compile and runtime.
|
||||
</para>
|
||||
<para>
|
||||
Kdb is simplistic shell-style interface which you can use on a
|
||||
system console with a keyboard or serial console. You can use it
|
||||
to inspect memory, registers, process lists, dmesg, and even set
|
||||
breakpoints to stop in a certain location. Kdb is not a source
|
||||
level debugger, although you can set breakpoints and execute some
|
||||
basic kernel run control. Kdb is mainly aimed at doing some
|
||||
analysis to aid in development or diagnosing kernel problems. You
|
||||
can access some symbols by name in kernel built-ins or in kernel
|
||||
modules if the code was built
|
||||
with <symbol>CONFIG_KALLSYMS</symbol>.
|
||||
</para>
|
||||
<para>
|
||||
Kgdb is intended to be used as a source level debugger for the
|
||||
Linux kernel. It is used along with gdb to debug a Linux kernel.
|
||||
The expectation is that gdb can be used to "break in" to the
|
||||
kernel to inspect memory, variables and look through call stack
|
||||
information similar to the way an application developer would use
|
||||
gdb to debug an application. It is possible to place breakpoints
|
||||
in kernel code and perform some limited execution stepping.
|
||||
</para>
|
||||
<para>
|
||||
Two machines are required for using kgdb. One of these machines is
|
||||
a development machine and the other is the target machine. The
|
||||
kernel to be debugged runs on the target machine. The development
|
||||
machine runs an instance of gdb against the vmlinux file which
|
||||
contains the symbols (not a boot image such as bzImage, zImage,
|
||||
uImage...). In gdb the developer specifies the connection
|
||||
parameters and connects to kgdb. The type of connection a
|
||||
developer makes with gdb depends on the availability of kgdb I/O
|
||||
modules compiled as built-ins or loadable kernel modules in the test
|
||||
machine's kernel.
|
||||
</para>
|
||||
</chapter>
|
||||
<chapter id="CompilingAKernel">
|
||||
<title>Compiling a kernel</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
<listitem><para>In order to enable compilation of kdb, you must first enable kgdb.</para></listitem>
|
||||
<listitem><para>The kgdb test compile options are described in the kgdb test suite chapter.</para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<sect1 id="CompileKGDB">
|
||||
<title>Kernel config options for kgdb</title>
|
||||
<para>
|
||||
To enable <symbol>CONFIG_KGDB</symbol> you should look under
|
||||
"Kernel hacking" / "Kernel debugging" and select "KGDB: kernel debugger".
|
||||
</para>
|
||||
<para>
|
||||
While it is not a hard requirement that you have symbols in your
|
||||
vmlinux file, gdb tends not to be very useful without the symbolic
|
||||
data, so you will want to turn
|
||||
on <symbol>CONFIG_DEBUG_INFO</symbol> which is called "Compile the
|
||||
kernel with debug info" in the config menu.
|
||||
</para>
|
||||
<para>
|
||||
It is advised, but not required, that you turn on the
|
||||
<symbol>CONFIG_FRAME_POINTER</symbol> kernel option which is called "Compile the
|
||||
kernel with frame pointers" in the config menu. This option
|
||||
inserts code to into the compiled executable which saves the frame
|
||||
information in registers or on the stack at different points which
|
||||
allows a debugger such as gdb to more accurately construct
|
||||
stack back traces while debugging the kernel.
|
||||
</para>
|
||||
<para>
|
||||
If the architecture that you are using supports the kernel option
|
||||
CONFIG_STRICT_KERNEL_RWX, you should consider turning it off. This
|
||||
option will prevent the use of software breakpoints because it
|
||||
marks certain regions of the kernel's memory space as read-only.
|
||||
If kgdb supports it for the architecture you are using, you can
|
||||
use hardware breakpoints if you desire to run with the
|
||||
CONFIG_STRICT_KERNEL_RWX option turned on, else you need to turn off
|
||||
this option.
|
||||
</para>
|
||||
<para>
|
||||
Next you should choose one of more I/O drivers to interconnect
|
||||
debugging host and debugged target. Early boot debugging requires
|
||||
a KGDB I/O driver that supports early debugging and the driver
|
||||
must be built into the kernel directly. Kgdb I/O driver
|
||||
configuration takes place via kernel or module parameters which
|
||||
you can learn more about in the in the section that describes the
|
||||
parameter "kgdboc".
|
||||
</para>
|
||||
<para>Here is an example set of .config symbols to enable or
|
||||
disable for kgdb:
|
||||
<itemizedlist>
|
||||
<listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem>
|
||||
<listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
|
||||
<listitem><para>CONFIG_KGDB=y</para></listitem>
|
||||
<listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect1>
|
||||
<sect1 id="CompileKDB">
|
||||
<title>Kernel config options for kdb</title>
|
||||
<para>Kdb is quite a bit more complex than the simple gdbstub
|
||||
sitting on top of the kernel's debug core. Kdb must implement a
|
||||
shell, and also adds some helper functions in other parts of the
|
||||
kernel, responsible for printing out interesting data such as what
|
||||
you would see if you ran "lsmod", or "ps". In order to build kdb
|
||||
into the kernel you follow the same steps as you would for kgdb.
|
||||
</para>
|
||||
<para>The main config option for kdb
|
||||
is <symbol>CONFIG_KGDB_KDB</symbol> which is called "KGDB_KDB:
|
||||
include kdb frontend for kgdb" in the config menu. In theory you
|
||||
would have already also selected an I/O driver such as the
|
||||
CONFIG_KGDB_SERIAL_CONSOLE interface if you plan on using kdb on a
|
||||
serial port, when you were configuring kgdb.
|
||||
</para>
|
||||
<para>If you want to use a PS/2-style keyboard with kdb, you would
|
||||
select CONFIG_KDB_KEYBOARD which is called "KGDB_KDB: keyboard as
|
||||
input device" in the config menu. The CONFIG_KDB_KEYBOARD option
|
||||
is not used for anything in the gdb interface to kgdb. The
|
||||
CONFIG_KDB_KEYBOARD option only works with kdb.
|
||||
</para>
|
||||
<para>Here is an example set of .config symbols to enable/disable kdb:
|
||||
<itemizedlist>
|
||||
<listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem>
|
||||
<listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
|
||||
<listitem><para>CONFIG_KGDB=y</para></listitem>
|
||||
<listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
|
||||
<listitem><para>CONFIG_KGDB_KDB=y</para></listitem>
|
||||
<listitem><para>CONFIG_KDB_KEYBOARD=y</para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect1>
|
||||
</chapter>
|
||||
<chapter id="kgdbKernelArgs">
|
||||
<title>Kernel Debugger Boot Arguments</title>
|
||||
<para>This section describes the various runtime kernel
|
||||
parameters that affect the configuration of the kernel debugger.
|
||||
The following chapter covers using kdb and kgdb as well as
|
||||
providing some examples of the configuration parameters.</para>
|
||||
<sect1 id="kgdboc">
|
||||
<title>Kernel parameter: kgdboc</title>
|
||||
<para>The kgdboc driver was originally an abbreviation meant to
|
||||
stand for "kgdb over console". Today it is the primary mechanism
|
||||
to configure how to communicate from gdb to kgdb as well as the
|
||||
devices you want to use to interact with the kdb shell.
|
||||
</para>
|
||||
<para>For kgdb/gdb, kgdboc is designed to work with a single serial
|
||||
port. It is intended to cover the circumstance where you want to
|
||||
use a serial console as your primary console as well as using it to
|
||||
perform kernel debugging. It is also possible to use kgdb on a
|
||||
serial port which is not designated as a system console. Kgdboc
|
||||
may be configured as a kernel built-in or a kernel loadable module.
|
||||
You can only make use of <constant>kgdbwait</constant> and early
|
||||
debugging if you build kgdboc into the kernel as a built-in.
|
||||
</para>
|
||||
<para>Optionally you can elect to activate kms (Kernel Mode
|
||||
Setting) integration. When you use kms with kgdboc and you have a
|
||||
video driver that has atomic mode setting hooks, it is possible to
|
||||
enter the debugger on the graphics console. When the kernel
|
||||
execution is resumed, the previous graphics mode will be restored.
|
||||
This integration can serve as a useful tool to aid in diagnosing
|
||||
crashes or doing analysis of memory with kdb while allowing the
|
||||
full graphics console applications to run.
|
||||
</para>
|
||||
<sect2 id="kgdbocArgs">
|
||||
<title>kgdboc arguments</title>
|
||||
<para>Usage: <constant>kgdboc=[kms][[,]kbd][[,]serial_device][,baud]</constant></para>
|
||||
<para>The order listed above must be observed if you use any of the
|
||||
optional configurations together.
|
||||
</para>
|
||||
<para>Abbreviations:
|
||||
<itemizedlist>
|
||||
<listitem><para>kms = Kernel Mode Setting</para></listitem>
|
||||
<listitem><para>kbd = Keyboard</para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<para>You can configure kgdboc to use the keyboard, and/or a serial
|
||||
device depending on if you are using kdb and/or kgdb, in one of the
|
||||
following scenarios. The order listed above must be observed if
|
||||
you use any of the optional configurations together. Using kms +
|
||||
only gdb is generally not a useful combination.</para>
|
||||
<sect3 id="kgdbocArgs1">
|
||||
<title>Using loadable module or built-in</title>
|
||||
<para>
|
||||
<orderedlist>
|
||||
<listitem><para>As a kernel built-in:</para>
|
||||
<para>Use the kernel boot argument: <constant>kgdboc=<tty-device>,[baud]</constant></para></listitem>
|
||||
<listitem>
|
||||
<para>As a kernel loadable module:</para>
|
||||
<para>Use the command: <constant>modprobe kgdboc kgdboc=<tty-device>,[baud]</constant></para>
|
||||
<para>Here are two examples of how you might format the kgdboc
|
||||
string. The first is for an x86 target using the first serial port.
|
||||
The second example is for the ARM Versatile AB using the second
|
||||
serial port.
|
||||
<orderedlist>
|
||||
<listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem>
|
||||
<listitem><para><constant>kgdboc=ttyAMA1,115200</constant></para></listitem>
|
||||
</orderedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
</orderedlist></para>
|
||||
</sect3>
|
||||
<sect3 id="kgdbocArgs2">
|
||||
<title>Configure kgdboc at runtime with sysfs</title>
|
||||
<para>At run time you can enable or disable kgdboc by echoing a
|
||||
parameters into the sysfs. Here are two examples:</para>
|
||||
<orderedlist>
|
||||
<listitem><para>Enable kgdboc on ttyS0</para>
|
||||
<para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
|
||||
<listitem><para>Disable kgdboc</para>
|
||||
<para><constant>echo "" > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
|
||||
</orderedlist>
|
||||
<para>NOTE: You do not need to specify the baud if you are
|
||||
configuring the console on tty which is already configured or
|
||||
open.</para>
|
||||
</sect3>
|
||||
<sect3 id="kgdbocArgs3">
|
||||
<title>More examples</title>
|
||||
<para>You can configure kgdboc to use the keyboard, and/or a serial device
|
||||
depending on if you are using kdb and/or kgdb, in one of the
|
||||
following scenarios.
|
||||
<orderedlist>
|
||||
<listitem><para>kdb and kgdb over only a serial port</para>
|
||||
<para><constant>kgdboc=<serial_device>[,baud]</constant></para>
|
||||
<para>Example: <constant>kgdboc=ttyS0,115200</constant></para>
|
||||
</listitem>
|
||||
<listitem><para>kdb and kgdb with keyboard and a serial port</para>
|
||||
<para><constant>kgdboc=kbd,<serial_device>[,baud]</constant></para>
|
||||
<para>Example: <constant>kgdboc=kbd,ttyS0,115200</constant></para>
|
||||
</listitem>
|
||||
<listitem><para>kdb with a keyboard</para>
|
||||
<para><constant>kgdboc=kbd</constant></para>
|
||||
</listitem>
|
||||
<listitem><para>kdb with kernel mode setting</para>
|
||||
<para><constant>kgdboc=kms,kbd</constant></para>
|
||||
</listitem>
|
||||
<listitem><para>kdb with kernel mode setting and kgdb over a serial port</para>
|
||||
<para><constant>kgdboc=kms,kbd,ttyS0,115200</constant></para>
|
||||
</listitem>
|
||||
</orderedlist>
|
||||
</para>
|
||||
<para>NOTE: Kgdboc does not support interrupting the target via the
|
||||
gdb remote protocol. You must manually send a sysrq-g unless you
|
||||
have a proxy that splits console output to a terminal program.
|
||||
A console proxy has a separate TCP port for the debugger and a separate
|
||||
TCP port for the "human" console. The proxy can take care of sending
|
||||
the sysrq-g for you.
|
||||
</para>
|
||||
<para>When using kgdboc with no debugger proxy, you can end up
|
||||
connecting the debugger at one of two entry points. If an
|
||||
exception occurs after you have loaded kgdboc, a message should
|
||||
print on the console stating it is waiting for the debugger. In
|
||||
this case you disconnect your terminal program and then connect the
|
||||
debugger in its place. If you want to interrupt the target system
|
||||
and forcibly enter a debug session you have to issue a Sysrq
|
||||
sequence and then type the letter <constant>g</constant>. Then
|
||||
you disconnect the terminal session and connect gdb. Your options
|
||||
if you don't like this are to hack gdb to send the sysrq-g for you
|
||||
as well as on the initial connect, or to use a debugger proxy that
|
||||
allows an unmodified gdb to do the debugging.
|
||||
</para>
|
||||
</sect3>
|
||||
</sect2>
|
||||
</sect1>
|
||||
<sect1 id="kgdbwait">
|
||||
<title>Kernel parameter: kgdbwait</title>
|
||||
<para>
|
||||
The Kernel command line option <constant>kgdbwait</constant> makes
|
||||
kgdb wait for a debugger connection during booting of a kernel. You
|
||||
can only use this option if you compiled a kgdb I/O driver into the
|
||||
kernel and you specified the I/O driver configuration as a kernel
|
||||
command line option. The kgdbwait parameter should always follow the
|
||||
configuration parameter for the kgdb I/O driver in the kernel
|
||||
command line else the I/O driver will not be configured prior to
|
||||
asking the kernel to use it to wait.
|
||||
</para>
|
||||
<para>
|
||||
The kernel will stop and wait as early as the I/O driver and
|
||||
architecture allows when you use this option. If you build the
|
||||
kgdb I/O driver as a loadable kernel module kgdbwait will not do
|
||||
anything.
|
||||
</para>
|
||||
</sect1>
|
||||
<sect1 id="kgdbcon">
|
||||
<title>Kernel parameter: kgdbcon</title>
|
||||
<para> The kgdbcon feature allows you to see printk() messages
|
||||
inside gdb while gdb is connected to the kernel. Kdb does not make
|
||||
use of the kgdbcon feature.
|
||||
</para>
|
||||
<para>Kgdb supports using the gdb serial protocol to send console
|
||||
messages to the debugger when the debugger is connected and running.
|
||||
There are two ways to activate this feature.
|
||||
<orderedlist>
|
||||
<listitem><para>Activate with the kernel command line option:</para>
|
||||
<para><constant>kgdbcon</constant></para>
|
||||
</listitem>
|
||||
<listitem><para>Use sysfs before configuring an I/O driver</para>
|
||||
<para>
|
||||
<constant>echo 1 > /sys/module/kgdb/parameters/kgdb_use_con</constant>
|
||||
</para>
|
||||
<para>
|
||||
NOTE: If you do this after you configure the kgdb I/O driver, the
|
||||
setting will not take effect until the next point the I/O is
|
||||
reconfigured.
|
||||
</para>
|
||||
</listitem>
|
||||
</orderedlist>
|
||||
</para>
|
||||
<para>IMPORTANT NOTE: You cannot use kgdboc + kgdbcon on a tty that is an
|
||||
active system console. An example of incorrect usage is <constant>console=ttyS0,115200 kgdboc=ttyS0 kgdbcon</constant>
|
||||
</para>
|
||||
<para>It is possible to use this option with kgdboc on a tty that is not a system console.
|
||||
</para>
|
||||
</sect1>
|
||||
<sect1 id="kgdbreboot">
|
||||
<title>Run time parameter: kgdbreboot</title>
|
||||
<para> The kgdbreboot feature allows you to change how the debugger
|
||||
deals with the reboot notification. You have 3 choices for the
|
||||
behavior. The default behavior is always set to 0.</para>
|
||||
<orderedlist>
|
||||
<listitem><para>echo -1 > /sys/module/debug_core/parameters/kgdbreboot</para>
|
||||
<para>Ignore the reboot notification entirely.</para>
|
||||
</listitem>
|
||||
<listitem><para>echo 0 > /sys/module/debug_core/parameters/kgdbreboot</para>
|
||||
<para>Send the detach message to any attached debugger client.</para>
|
||||
</listitem>
|
||||
<listitem><para>echo 1 > /sys/module/debug_core/parameters/kgdbreboot</para>
|
||||
<para>Enter the debugger on reboot notify.</para>
|
||||
</listitem>
|
||||
</orderedlist>
|
||||
</sect1>
|
||||
</chapter>
|
||||
<chapter id="usingKDB">
|
||||
<title>Using kdb</title>
|
||||
<para>
|
||||
</para>
|
||||
<sect1 id="quickKDBserial">
|
||||
<title>Quick start for kdb on a serial port</title>
|
||||
<para>This is a quick example of how to use kdb.</para>
|
||||
<para><orderedlist>
|
||||
<listitem><para>Configure kgdboc at boot using kernel parameters:
|
||||
<itemizedlist>
|
||||
<listitem><para><constant>console=ttyS0,115200 kgdboc=ttyS0,115200</constant></para></listitem>
|
||||
</itemizedlist></para>
|
||||
<para>OR</para>
|
||||
<para>Configure kgdboc after the kernel has booted; assuming you are using a serial port console:
|
||||
<itemizedlist>
|
||||
<listitem><para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para>
|
||||
<itemizedlist>
|
||||
<listitem><para>When logged in as root or with a super user session you can run:</para>
|
||||
<para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem>
|
||||
<listitem><para>Example using minicom 2.2</para>
|
||||
<para>Press: <constant>Control-a</constant></para>
|
||||
<para>Press: <constant>f</constant></para>
|
||||
<para>Press: <constant>g</constant></para>
|
||||
</listitem>
|
||||
<listitem><para>When you have telneted to a terminal server that supports sending a remote break</para>
|
||||
<para>Press: <constant>Control-]</constant></para>
|
||||
<para>Type in:<constant>send break</constant></para>
|
||||
<para>Press: <constant>Enter</constant></para>
|
||||
<para>Press: <constant>g</constant></para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</listitem>
|
||||
<listitem><para>From the kdb prompt you can run the "help" command to see a complete list of the commands that are available.</para>
|
||||
<para>Some useful commands in kdb include:
|
||||
<itemizedlist>
|
||||
<listitem><para>lsmod -- Shows where kernel modules are loaded</para></listitem>
|
||||
<listitem><para>ps -- Displays only the active processes</para></listitem>
|
||||
<listitem><para>ps A -- Shows all the processes</para></listitem>
|
||||
<listitem><para>summary -- Shows kernel version info and memory usage</para></listitem>
|
||||
<listitem><para>bt -- Get a backtrace of the current process using dump_stack()</para></listitem>
|
||||
<listitem><para>dmesg -- View the kernel syslog buffer</para></listitem>
|
||||
<listitem><para>go -- Continue the system</para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>When you are done using kdb you need to consider rebooting the
|
||||
system or using the "go" command to resuming normal kernel
|
||||
execution. If you have paused the kernel for a lengthy period of
|
||||
time, applications that rely on timely networking or anything to do
|
||||
with real wall clock time could be adversely affected, so you
|
||||
should take this into consideration when using the kernel
|
||||
debugger.</para>
|
||||
</listitem>
|
||||
</orderedlist></para>
|
||||
</sect1>
|
||||
<sect1 id="quickKDBkeyboard">
|
||||
<title>Quick start for kdb using a keyboard connected console</title>
|
||||
<para>This is a quick example of how to use kdb with a keyboard.</para>
|
||||
<para><orderedlist>
|
||||
<listitem><para>Configure kgdboc at boot using kernel parameters:
|
||||
<itemizedlist>
|
||||
<listitem><para><constant>kgdboc=kbd</constant></para></listitem>
|
||||
</itemizedlist></para>
|
||||
<para>OR</para>
|
||||
<para>Configure kgdboc after the kernel has booted:
|
||||
<itemizedlist>
|
||||
<listitem><para><constant>echo kbd > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para>
|
||||
<itemizedlist>
|
||||
<listitem><para>When logged in as root or with a super user session you can run:</para>
|
||||
<para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem>
|
||||
<listitem><para>Example using a laptop keyboard</para>
|
||||
<para>Press and hold down: <constant>Alt</constant></para>
|
||||
<para>Press and hold down: <constant>Fn</constant></para>
|
||||
<para>Press and release the key with the label: <constant>SysRq</constant></para>
|
||||
<para>Release: <constant>Fn</constant></para>
|
||||
<para>Press and release: <constant>g</constant></para>
|
||||
<para>Release: <constant>Alt</constant></para>
|
||||
</listitem>
|
||||
<listitem><para>Example using a PS/2 101-key keyboard</para>
|
||||
<para>Press and hold down: <constant>Alt</constant></para>
|
||||
<para>Press and release the key with the label: <constant>SysRq</constant></para>
|
||||
<para>Press and release: <constant>g</constant></para>
|
||||
<para>Release: <constant>Alt</constant></para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Now type in a kdb command such as "help", "dmesg", "bt" or "go" to continue kernel execution.</para>
|
||||
</listitem>
|
||||
</orderedlist></para>
|
||||
</sect1>
|
||||
</chapter>
|
||||
<chapter id="EnableKGDB">
|
||||
<title>Using kgdb / gdb</title>
|
||||
<para>In order to use kgdb you must activate it by passing
|
||||
configuration information to one of the kgdb I/O drivers. If you
|
||||
do not pass any configuration information kgdb will not do anything
|
||||
at all. Kgdb will only actively hook up to the kernel trap hooks
|
||||
if a kgdb I/O driver is loaded and configured. If you unconfigure
|
||||
a kgdb I/O driver, kgdb will unregister all the kernel hook points.
|
||||
</para>
|
||||
<para> All kgdb I/O drivers can be reconfigured at run time, if
|
||||
<symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol>
|
||||
are enabled, by echo'ing a new config string to
|
||||
<constant>/sys/module/<driver>/parameter/<option></constant>.
|
||||
The driver can be unconfigured by passing an empty string. You cannot
|
||||
change the configuration while the debugger is attached. Make sure
|
||||
to detach the debugger with the <constant>detach</constant> command
|
||||
prior to trying to unconfigure a kgdb I/O driver.
|
||||
</para>
|
||||
<sect1 id="ConnectingGDB">
|
||||
<title>Connecting with gdb to a serial port</title>
|
||||
<orderedlist>
|
||||
<listitem><para>Configure kgdboc</para>
|
||||
<para>Configure kgdboc at boot using kernel parameters:
|
||||
<itemizedlist>
|
||||
<listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem>
|
||||
</itemizedlist></para>
|
||||
<para>OR</para>
|
||||
<para>Configure kgdboc after the kernel has booted:
|
||||
<itemizedlist>
|
||||
<listitem><para><constant>echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
|
||||
</itemizedlist></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Stop kernel execution (break into the debugger)</para>
|
||||
<para>In order to connect to gdb via kgdboc, the kernel must
|
||||
first be stopped. There are several ways to stop the kernel which
|
||||
include using kgdbwait as a boot argument, via a sysrq-g, or running
|
||||
the kernel until it takes an exception where it waits for the
|
||||
debugger to attach.
|
||||
<itemizedlist>
|
||||
<listitem><para>When logged in as root or with a super user session you can run:</para>
|
||||
<para><constant>echo g > /proc/sysrq-trigger</constant></para></listitem>
|
||||
<listitem><para>Example using minicom 2.2</para>
|
||||
<para>Press: <constant>Control-a</constant></para>
|
||||
<para>Press: <constant>f</constant></para>
|
||||
<para>Press: <constant>g</constant></para>
|
||||
</listitem>
|
||||
<listitem><para>When you have telneted to a terminal server that supports sending a remote break</para>
|
||||
<para>Press: <constant>Control-]</constant></para>
|
||||
<para>Type in:<constant>send break</constant></para>
|
||||
<para>Press: <constant>Enter</constant></para>
|
||||
<para>Press: <constant>g</constant></para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Connect from gdb</para>
|
||||
<para>
|
||||
Example (using a directly connected port):
|
||||
</para>
|
||||
<programlisting>
|
||||
% gdb ./vmlinux
|
||||
(gdb) set remotebaud 115200
|
||||
(gdb) target remote /dev/ttyS0
|
||||
</programlisting>
|
||||
<para>
|
||||
Example (kgdb to a terminal server on TCP port 2012):
|
||||
</para>
|
||||
<programlisting>
|
||||
% gdb ./vmlinux
|
||||
(gdb) target remote 192.168.2.2:2012
|
||||
</programlisting>
|
||||
<para>
|
||||
Once connected, you can debug a kernel the way you would debug an
|
||||
application program.
|
||||
</para>
|
||||
<para>
|
||||
If you are having problems connecting or something is going
|
||||
seriously wrong while debugging, it will most often be the case
|
||||
that you want to enable gdb to be verbose about its target
|
||||
communications. You do this prior to issuing the <constant>target
|
||||
remote</constant> command by typing in: <constant>set debug remote 1</constant>
|
||||
</para>
|
||||
</listitem>
|
||||
</orderedlist>
|
||||
<para>Remember if you continue in gdb, and need to "break in" again,
|
||||
you need to issue an other sysrq-g. It is easy to create a simple
|
||||
entry point by putting a breakpoint at <constant>sys_sync</constant>
|
||||
and then you can run "sync" from a shell or script to break into the
|
||||
debugger.</para>
|
||||
</sect1>
|
||||
</chapter>
|
||||
<chapter id="switchKdbKgdb">
|
||||
<title>kgdb and kdb interoperability</title>
|
||||
<para>It is possible to transition between kdb and kgdb dynamically.
|
||||
The debug core will remember which you used the last time and
|
||||
automatically start in the same mode.</para>
|
||||
<sect1>
|
||||
<title>Switching between kdb and kgdb</title>
|
||||
<sect2>
|
||||
<title>Switching from kgdb to kdb</title>
|
||||
<para>
|
||||
There are two ways to switch from kgdb to kdb: you can use gdb to
|
||||
issue a maintenance packet, or you can blindly type the command $3#33.
|
||||
Whenever the kernel debugger stops in kgdb mode it will print the
|
||||
message <constant>KGDB or $3#33 for KDB</constant>. It is important
|
||||
to note that you have to type the sequence correctly in one pass.
|
||||
You cannot type a backspace or delete because kgdb will interpret
|
||||
that as part of the debug stream.
|
||||
<orderedlist>
|
||||
<listitem><para>Change from kgdb to kdb by blindly typing:</para>
|
||||
<para><constant>$3#33</constant></para></listitem>
|
||||
<listitem><para>Change from kgdb to kdb with gdb</para>
|
||||
<para><constant>maintenance packet 3</constant></para>
|
||||
<para>NOTE: Now you must kill gdb. Typically you press control-z and
|
||||
issue the command: kill -9 %</para></listitem>
|
||||
</orderedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2>
|
||||
<title>Change from kdb to kgdb</title>
|
||||
<para>There are two ways you can change from kdb to kgdb. You can
|
||||
manually enter kgdb mode by issuing the kgdb command from the kdb
|
||||
shell prompt, or you can connect gdb while the kdb shell prompt is
|
||||
active. The kdb shell looks for the typical first commands that gdb
|
||||
would issue with the gdb remote protocol and if it sees one of those
|
||||
commands it automatically changes into kgdb mode.</para>
|
||||
<orderedlist>
|
||||
<listitem><para>From kdb issue the command:</para>
|
||||
<para><constant>kgdb</constant></para>
|
||||
<para>Now disconnect your terminal program and connect gdb in its place</para></listitem>
|
||||
<listitem><para>At the kdb prompt, disconnect the terminal program and connect gdb in its place.</para></listitem>
|
||||
</orderedlist>
|
||||
</sect2>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<title>Running kdb commands from gdb</title>
|
||||
<para>It is possible to run a limited set of kdb commands from gdb,
|
||||
using the gdb monitor command. You don't want to execute any of the
|
||||
run control or breakpoint operations, because it can disrupt the
|
||||
state of the kernel debugger. You should be using gdb for
|
||||
breakpoints and run control operations if you have gdb connected.
|
||||
The more useful commands to run are things like lsmod, dmesg, ps or
|
||||
possibly some of the memory information commands. To see all the kdb
|
||||
commands you can run <constant>monitor help</constant>.</para>
|
||||
<para>Example:
|
||||
<informalexample><programlisting>
|
||||
(gdb) monitor ps
|
||||
1 idle process (state I) and
|
||||
27 sleeping system daemon (state M) processes suppressed,
|
||||
use 'ps A' to see all.
|
||||
Task Addr Pid Parent [*] cpu State Thread Command
|
||||
|
||||
0xc78291d0 1 0 0 0 S 0xc7829404 init
|
||||
0xc7954150 942 1 0 0 S 0xc7954384 dropbear
|
||||
0xc78789c0 944 1 0 0 S 0xc7878bf4 sh
|
||||
(gdb)
|
||||
</programlisting></informalexample>
|
||||
</para>
|
||||
</sect1>
|
||||
</chapter>
|
||||
<chapter id="KGDBTestSuite">
|
||||
<title>kgdb Test Suite</title>
|
||||
<para>
|
||||
When kgdb is enabled in the kernel config you can also elect to
|
||||
enable the config parameter KGDB_TESTS. Turning this on will
|
||||
enable a special kgdb I/O module which is designed to test the
|
||||
kgdb internal functions.
|
||||
</para>
|
||||
<para>
|
||||
The kgdb tests are mainly intended for developers to test the kgdb
|
||||
internals as well as a tool for developing a new kgdb architecture
|
||||
specific implementation. These tests are not really for end users
|
||||
of the Linux kernel. The primary source of documentation would be
|
||||
to look in the drivers/misc/kgdbts.c file.
|
||||
</para>
|
||||
<para>
|
||||
The kgdb test suite can also be configured at compile time to run
|
||||
the core set of tests by setting the kernel config parameter
|
||||
KGDB_TESTS_ON_BOOT. This particular option is aimed at automated
|
||||
regression testing and does not require modifying the kernel boot
|
||||
config arguments. If this is turned on, the kgdb test suite can
|
||||
be disabled by specifying "kgdbts=" as a kernel boot argument.
|
||||
</para>
|
||||
</chapter>
|
||||
<chapter id="CommonBackEndReq">
|
||||
<title>Kernel Debugger Internals</title>
|
||||
<sect1 id="kgdbArchitecture">
|
||||
<title>Architecture Specifics</title>
|
||||
<para>
|
||||
The kernel debugger is organized into a number of components:
|
||||
<orderedlist>
|
||||
<listitem><para>The debug core</para>
|
||||
<para>
|
||||
The debug core is found in kernel/debugger/debug_core.c. It contains:
|
||||
<itemizedlist>
|
||||
<listitem><para>A generic OS exception handler which includes
|
||||
sync'ing the processors into a stopped state on an multi-CPU
|
||||
system.</para></listitem>
|
||||
<listitem><para>The API to talk to the kgdb I/O drivers</para></listitem>
|
||||
<listitem><para>The API to make calls to the arch-specific kgdb implementation</para></listitem>
|
||||
<listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem>
|
||||
<listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem>
|
||||
<listitem><para>The API to invoke either the kdb or kgdb frontend to the debug core.</para></listitem>
|
||||
<listitem><para>The structures and callback API for atomic kernel mode setting.</para>
|
||||
<para>NOTE: kgdboc is where the kms callbacks are invoked.</para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem><para>kgdb arch-specific implementation</para>
|
||||
<para>
|
||||
This implementation is generally found in arch/*/kernel/kgdb.c.
|
||||
As an example, arch/x86/kernel/kgdb.c contains the specifics to
|
||||
implement HW breakpoint as well as the initialization to
|
||||
dynamically register and unregister for the trap handlers on
|
||||
this architecture. The arch-specific portion implements:
|
||||
<itemizedlist>
|
||||
<listitem><para>contains an arch-specific trap catcher which
|
||||
invokes kgdb_handle_exception() to start kgdb about doing its
|
||||
work</para></listitem>
|
||||
<listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem>
|
||||
<listitem><para>Registration and unregistration of architecture specific trap hooks</para></listitem>
|
||||
<listitem><para>Any special exception handling and cleanup</para></listitem>
|
||||
<listitem><para>NMI exception handling and cleanup</para></listitem>
|
||||
<listitem><para>(optional) HW breakpoints</para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem><para>gdbstub frontend (aka kgdb)</para>
|
||||
<para>The gdbstub is located in kernel/debug/gdbstub.c. It contains:</para>
|
||||
<itemizedlist>
|
||||
<listitem><para>All the logic to implement the gdb serial protocol</para></listitem>
|
||||
</itemizedlist>
|
||||
</listitem>
|
||||
<listitem><para>kdb frontend</para>
|
||||
<para>The kdb debugger shell is broken down into a number of
|
||||
components. The kdb core is located in kernel/debug/kdb. There
|
||||
are a number of helper functions in some of the other kernel
|
||||
components to make it possible for kdb to examine and report
|
||||
information about the kernel without taking locks that could
|
||||
cause a kernel deadlock. The kdb core contains implements the following functionality.</para>
|
||||
<itemizedlist>
|
||||
<listitem><para>A simple shell</para></listitem>
|
||||
<listitem><para>The kdb core command set</para></listitem>
|
||||
<listitem><para>A registration API to register additional kdb shell commands.</para>
|
||||
<itemizedlist>
|
||||
<listitem><para>A good example of a self-contained kdb module
|
||||
is the "ftdump" command for dumping the ftrace buffer. See:
|
||||
kernel/trace/trace_kdb.c</para></listitem>
|
||||
<listitem><para>For an example of how to dynamically register
|
||||
a new kdb command you can build the kdb_hello.ko kernel module
|
||||
from samples/kdb/kdb_hello.c. To build this example you can
|
||||
set CONFIG_SAMPLES=y and CONFIG_SAMPLE_KDB=m in your kernel
|
||||
config. Later run "modprobe kdb_hello" and the next time you
|
||||
enter the kdb shell, you can run the "hello"
|
||||
command.</para></listitem>
|
||||
</itemizedlist></listitem>
|
||||
<listitem><para>The implementation for kdb_printf() which
|
||||
emits messages directly to I/O drivers, bypassing the kernel
|
||||
log.</para></listitem>
|
||||
<listitem><para>SW / HW breakpoint management for the kdb shell</para></listitem>
|
||||
</itemizedlist>
|
||||
</listitem>
|
||||
<listitem><para>kgdb I/O driver</para>
|
||||
<para>
|
||||
Each kgdb I/O driver has to provide an implementation for the following:
|
||||
<itemizedlist>
|
||||
<listitem><para>configuration via built-in or module</para></listitem>
|
||||
<listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem>
|
||||
<listitem><para>read and write character interface</para></listitem>
|
||||
<listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem>
|
||||
<listitem><para>(optional) Early debug methodology</para></listitem>
|
||||
</itemizedlist>
|
||||
Any given kgdb I/O driver has to operate very closely with the
|
||||
hardware and must do it in such a way that does not enable
|
||||
interrupts or change other parts of the system context without
|
||||
completely restoring them. The kgdb core will repeatedly "poll"
|
||||
a kgdb I/O driver for characters when it needs input. The I/O
|
||||
driver is expected to return immediately if there is no data
|
||||
available. Doing so allows for the future possibility to touch
|
||||
watchdog hardware in such a way as to have a target system not
|
||||
reset when these are enabled.
|
||||
</para>
|
||||
</listitem>
|
||||
</orderedlist>
|
||||
</para>
|
||||
<para>
|
||||
If you are intent on adding kgdb architecture specific support
|
||||
for a new architecture, the architecture should define
|
||||
<constant>HAVE_ARCH_KGDB</constant> in the architecture specific
|
||||
Kconfig file. This will enable kgdb for the architecture, and
|
||||
at that point you must create an architecture specific kgdb
|
||||
implementation.
|
||||
</para>
|
||||
<para>
|
||||
There are a few flags which must be set on every architecture in
|
||||
their <asm/kgdb.h> file. These are:
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
NUMREGBYTES: The size in bytes of all of the registers, so
|
||||
that we can ensure they will all fit into a packet.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
BUFMAX: The size in bytes of the buffer GDB will read into.
|
||||
This must be larger than NUMREGBYTES.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
CACHE_FLUSH_IS_SAFE: Set to 1 if it is always safe to call
|
||||
flush_cache_range or flush_icache_range. On some architectures,
|
||||
these functions may not be safe to call on SMP since we keep other
|
||||
CPUs in a holding pattern.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<para>
|
||||
There are also the following functions for the common backend,
|
||||
found in kernel/kgdb.c, that must be supplied by the
|
||||
architecture-specific backend unless marked as (optional), in
|
||||
which case a default function maybe used if the architecture
|
||||
does not need to provide a specific implementation.
|
||||
</para>
|
||||
!Iinclude/linux/kgdb.h
|
||||
</sect1>
|
||||
<sect1 id="kgdbocDesign">
|
||||
<title>kgdboc internals</title>
|
||||
<sect2>
|
||||
<title>kgdboc and uarts</title>
|
||||
<para>
|
||||
The kgdboc driver is actually a very thin driver that relies on the
|
||||
underlying low level to the hardware driver having "polling hooks"
|
||||
to which the tty driver is attached. In the initial
|
||||
implementation of kgdboc the serial_core was changed to expose a
|
||||
low level UART hook for doing polled mode reading and writing of a
|
||||
single character while in an atomic context. When kgdb makes an I/O
|
||||
request to the debugger, kgdboc invokes a callback in the serial
|
||||
core which in turn uses the callback in the UART driver.</para>
|
||||
<para>
|
||||
When using kgdboc with a UART, the UART driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting>
|
||||
#ifdef CONFIG_CONSOLE_POLL
|
||||
.poll_get_char = serial8250_get_poll_char,
|
||||
.poll_put_char = serial8250_put_poll_char,
|
||||
#endif
|
||||
</programlisting>
|
||||
Any implementation specifics around creating a polling driver use the
|
||||
<constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above.
|
||||
Keep in mind that polling hooks have to be implemented in such a way
|
||||
that they can be called from an atomic context and have to restore
|
||||
the state of the UART chip on return such that the system can return
|
||||
to normal when the debugger detaches. You need to be very careful
|
||||
with any kind of lock you consider, because failing here is most likely
|
||||
going to mean pressing the reset button.
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2 id="kgdbocKbd">
|
||||
<title>kgdboc and keyboards</title>
|
||||
<para>The kgdboc driver contains logic to configure communications
|
||||
with an attached keyboard. The keyboard infrastructure is only
|
||||
compiled into the kernel when CONFIG_KDB_KEYBOARD=y is set in the
|
||||
kernel configuration.</para>
|
||||
<para>The core polled keyboard driver driver for PS/2 type keyboards
|
||||
is in drivers/char/kdb_keyboard.c. This driver is hooked into the
|
||||
debug core when kgdboc populates the callback in the array
|
||||
called <constant>kdb_poll_funcs[]</constant>. The
|
||||
kdb_get_kbd_char() is the top-level function which polls hardware
|
||||
for single character input.
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2 id="kgdbocKms">
|
||||
<title>kgdboc and kms</title>
|
||||
<para>The kgdboc driver contains logic to request the graphics
|
||||
display to switch to a text context when you are using
|
||||
"kgdboc=kms,kbd", provided that you have a video driver which has a
|
||||
frame buffer console and atomic kernel mode setting support.</para>
|
||||
<para>
|
||||
Every time the kernel
|
||||
debugger is entered it calls kgdboc_pre_exp_handler() which in turn
|
||||
calls con_debug_enter() in the virtual console layer. On resuming kernel
|
||||
execution, the kernel debugger calls kgdboc_post_exp_handler() which
|
||||
in turn calls con_debug_leave().</para>
|
||||
<para>Any video driver that wants to be compatible with the kernel
|
||||
debugger and the atomic kms callbacks must implement the
|
||||
mode_set_base_atomic, fb_debug_enter and fb_debug_leave operations.
|
||||
For the fb_debug_enter and fb_debug_leave the option exists to use
|
||||
the generic drm fb helper functions or implement something custom for
|
||||
the hardware. The following example shows the initialization of the
|
||||
.mode_set_base_atomic operation in
|
||||
drivers/gpu/drm/i915/intel_display.c:
|
||||
<informalexample>
|
||||
<programlisting>
|
||||
static const struct drm_crtc_helper_funcs intel_helper_funcs = {
|
||||
[...]
|
||||
.mode_set_base_atomic = intel_pipe_set_base_atomic,
|
||||
[...]
|
||||
};
|
||||
</programlisting>
|
||||
</informalexample>
|
||||
</para>
|
||||
<para>Here is an example of how the i915 driver initializes the fb_debug_enter and fb_debug_leave functions to use the generic drm helpers in
|
||||
drivers/gpu/drm/i915/intel_fb.c:
|
||||
<informalexample>
|
||||
<programlisting>
|
||||
static struct fb_ops intelfb_ops = {
|
||||
[...]
|
||||
.fb_debug_enter = drm_fb_helper_debug_enter,
|
||||
.fb_debug_leave = drm_fb_helper_debug_leave,
|
||||
[...]
|
||||
};
|
||||
</programlisting>
|
||||
</informalexample>
|
||||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
</chapter>
|
||||
<chapter id="credits">
|
||||
<title>Credits</title>
|
||||
<para>
|
||||
The following people have contributed to this document:
|
||||
<orderedlist>
|
||||
<listitem><para>Amit Kale<email>amitkale@linsyssoft.com</email></para></listitem>
|
||||
<listitem><para>Tom Rini<email>trini@kernel.crashing.org</email></para></listitem>
|
||||
</orderedlist>
|
||||
In March 2008 this document was completely rewritten by:
|
||||
<itemizedlist>
|
||||
<listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
|
||||
</itemizedlist>
|
||||
In Jan 2010 this document was updated to include kdb.
|
||||
<itemizedlist>
|
||||
<listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</chapter>
|
||||
</book>
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,289 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<book id="Reed-Solomon-Library-Guide">
|
||||
<bookinfo>
|
||||
<title>Reed-Solomon Library Programming Interface</title>
|
||||
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>Thomas</firstname>
|
||||
<surname>Gleixner</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>tglx@linutronix.de</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
|
||||
<copyright>
|
||||
<year>2004</year>
|
||||
<holder>Thomas Gleixner</holder>
|
||||
</copyright>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This documentation is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU General Public
|
||||
License version 2 as published by the Free Software Foundation.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This program is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU General Public License for more details.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
You should have received a copy of the GNU General Public
|
||||
License along with this program; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
||||
MA 02111-1307 USA
|
||||
</para>
|
||||
|
||||
<para>
|
||||
For more details see the file COPYING in the source
|
||||
distribution of Linux.
|
||||
</para>
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter id="intro">
|
||||
<title>Introduction</title>
|
||||
<para>
|
||||
The generic Reed-Solomon Library provides encoding, decoding
|
||||
and error correction functions.
|
||||
</para>
|
||||
<para>
|
||||
Reed-Solomon codes are used in communication and storage
|
||||
applications to ensure data integrity.
|
||||
</para>
|
||||
<para>
|
||||
This documentation is provided for developers who want to utilize
|
||||
the functions provided by the library.
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="bugs">
|
||||
<title>Known Bugs And Assumptions</title>
|
||||
<para>
|
||||
None.
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="usage">
|
||||
<title>Usage</title>
|
||||
<para>
|
||||
This chapter provides examples of how to use the library.
|
||||
</para>
|
||||
<sect1>
|
||||
<title>Initializing</title>
|
||||
<para>
|
||||
The init function init_rs returns a pointer to an
|
||||
rs decoder structure, which holds the necessary
|
||||
information for encoding, decoding and error correction
|
||||
with the given polynomial. It either uses an existing
|
||||
matching decoder or creates a new one. On creation all
|
||||
the lookup tables for fast en/decoding are created.
|
||||
The function may take a while, so make sure not to
|
||||
call it in critical code paths.
|
||||
</para>
|
||||
<programlisting>
|
||||
/* the Reed Solomon control structure */
|
||||
static struct rs_control *rs_decoder;
|
||||
|
||||
/* Symbolsize is 10 (bits)
|
||||
* Primitive polynomial is x^10+x^3+1
|
||||
* first consecutive root is 0
|
||||
* primitive element to generate roots = 1
|
||||
* generator polynomial degree (number of roots) = 6
|
||||
*/
|
||||
rs_decoder = init_rs (10, 0x409, 0, 1, 6);
|
||||
</programlisting>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<title>Encoding</title>
|
||||
<para>
|
||||
The encoder calculates the Reed-Solomon code over
|
||||
the given data length and stores the result in
|
||||
the parity buffer. Note that the parity buffer must
|
||||
be initialized before calling the encoder.
|
||||
</para>
|
||||
<para>
|
||||
The expanded data can be inverted on the fly by
|
||||
providing a non-zero inversion mask. The expanded data is
|
||||
XOR'ed with the mask. This is used e.g. for FLASH
|
||||
ECC, where the all 0xFF is inverted to an all 0x00.
|
||||
The Reed-Solomon code for all 0x00 is all 0x00. The
|
||||
code is inverted before storing to FLASH so it is 0xFF
|
||||
too. This prevents that reading from an erased FLASH
|
||||
results in ECC errors.
|
||||
</para>
|
||||
<para>
|
||||
The databytes are expanded to the given symbol size
|
||||
on the fly. There is no support for encoding continuous
|
||||
bitstreams with a symbol size != 8 at the moment. If
|
||||
it is necessary it should be not a big deal to implement
|
||||
such functionality.
|
||||
</para>
|
||||
<programlisting>
|
||||
/* Parity buffer. Size = number of roots */
|
||||
uint16_t par[6];
|
||||
/* Initialize the parity buffer */
|
||||
memset(par, 0, sizeof(par));
|
||||
/* Encode 512 byte in data8. Store parity in buffer par */
|
||||
encode_rs8 (rs_decoder, data8, 512, par, 0);
|
||||
</programlisting>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<title>Decoding</title>
|
||||
<para>
|
||||
The decoder calculates the syndrome over
|
||||
the given data length and the received parity symbols
|
||||
and corrects errors in the data.
|
||||
</para>
|
||||
<para>
|
||||
If a syndrome is available from a hardware decoder
|
||||
then the syndrome calculation is skipped.
|
||||
</para>
|
||||
<para>
|
||||
The correction of the data buffer can be suppressed
|
||||
by providing a correction pattern buffer and an error
|
||||
location buffer to the decoder. The decoder stores the
|
||||
calculated error location and the correction bitmask
|
||||
in the given buffers. This is useful for hardware
|
||||
decoders which use a weird bit ordering scheme.
|
||||
</para>
|
||||
<para>
|
||||
The databytes are expanded to the given symbol size
|
||||
on the fly. There is no support for decoding continuous
|
||||
bitstreams with a symbolsize != 8 at the moment. If
|
||||
it is necessary it should be not a big deal to implement
|
||||
such functionality.
|
||||
</para>
|
||||
|
||||
<sect2>
|
||||
<title>
|
||||
Decoding with syndrome calculation, direct data correction
|
||||
</title>
|
||||
<programlisting>
|
||||
/* Parity buffer. Size = number of roots */
|
||||
uint16_t par[6];
|
||||
uint8_t data[512];
|
||||
int numerr;
|
||||
/* Receive data */
|
||||
.....
|
||||
/* Receive parity */
|
||||
.....
|
||||
/* Decode 512 byte in data8.*/
|
||||
numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL);
|
||||
</programlisting>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>
|
||||
Decoding with syndrome given by hardware decoder, direct data correction
|
||||
</title>
|
||||
<programlisting>
|
||||
/* Parity buffer. Size = number of roots */
|
||||
uint16_t par[6], syn[6];
|
||||
uint8_t data[512];
|
||||
int numerr;
|
||||
/* Receive data */
|
||||
.....
|
||||
/* Receive parity */
|
||||
.....
|
||||
/* Get syndrome from hardware decoder */
|
||||
.....
|
||||
/* Decode 512 byte in data8.*/
|
||||
numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL);
|
||||
</programlisting>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>
|
||||
Decoding with syndrome given by hardware decoder, no direct data correction.
|
||||
</title>
|
||||
<para>
|
||||
Note: It's not necessary to give data and received parity to the decoder.
|
||||
</para>
|
||||
<programlisting>
|
||||
/* Parity buffer. Size = number of roots */
|
||||
uint16_t par[6], syn[6], corr[8];
|
||||
uint8_t data[512];
|
||||
int numerr, errpos[8];
|
||||
/* Receive data */
|
||||
.....
|
||||
/* Receive parity */
|
||||
.....
|
||||
/* Get syndrome from hardware decoder */
|
||||
.....
|
||||
/* Decode 512 byte in data8.*/
|
||||
numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr);
|
||||
for (i = 0; i < numerr; i++) {
|
||||
do_error_correction_in_your_buffer(errpos[i], corr[i]);
|
||||
}
|
||||
</programlisting>
|
||||
</sect2>
|
||||
</sect1>
|
||||
<sect1>
|
||||
<title>Cleanup</title>
|
||||
<para>
|
||||
The function free_rs frees the allocated resources,
|
||||
if the caller is the last user of the decoder.
|
||||
</para>
|
||||
<programlisting>
|
||||
/* Release resources */
|
||||
free_rs(rs_decoder);
|
||||
</programlisting>
|
||||
</sect1>
|
||||
|
||||
</chapter>
|
||||
|
||||
<chapter id="structs">
|
||||
<title>Structures</title>
|
||||
<para>
|
||||
This chapter contains the autogenerated documentation of the structures which are
|
||||
used in the Reed-Solomon Library and are relevant for a developer.
|
||||
</para>
|
||||
!Iinclude/linux/rslib.h
|
||||
</chapter>
|
||||
|
||||
<chapter id="pubfunctions">
|
||||
<title>Public Functions Provided</title>
|
||||
<para>
|
||||
This chapter contains the autogenerated documentation of the Reed-Solomon functions
|
||||
which are exported.
|
||||
</para>
|
||||
!Elib/reed_solomon/reed_solomon.c
|
||||
</chapter>
|
||||
|
||||
<chapter id="credits">
|
||||
<title>Credits</title>
|
||||
<para>
|
||||
The library code for encoding and decoding was written by Phil Karn.
|
||||
</para>
|
||||
<programlisting>
|
||||
Copyright 2002, Phil Karn, KA9Q
|
||||
May be used under the terms of the GNU General Public License (GPL)
|
||||
</programlisting>
|
||||
<para>
|
||||
The wrapper functions and interfaces are written by Thomas Gleixner.
|
||||
</para>
|
||||
<para>
|
||||
Many users have provided bugfixes, improvements and helping hands for testing.
|
||||
Thanks a lot.
|
||||
</para>
|
||||
<para>
|
||||
The following people have contributed to this document:
|
||||
</para>
|
||||
<para>
|
||||
Thomas Gleixner<email>tglx@linutronix.de</email>
|
||||
</para>
|
||||
</chapter>
|
||||
</book>
|
@ -1,265 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<article class="whitepaper" id="LinuxSecurityModule" lang="en">
|
||||
<articleinfo>
|
||||
<title>Linux Security Modules: General Security Hooks for Linux</title>
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>Stephen</firstname>
|
||||
<surname>Smalley</surname>
|
||||
<affiliation>
|
||||
<orgname>NAI Labs</orgname>
|
||||
<address><email>ssmalley@nai.com</email></address>
|
||||
</affiliation>
|
||||
</author>
|
||||
<author>
|
||||
<firstname>Timothy</firstname>
|
||||
<surname>Fraser</surname>
|
||||
<affiliation>
|
||||
<orgname>NAI Labs</orgname>
|
||||
<address><email>tfraser@nai.com</email></address>
|
||||
</affiliation>
|
||||
</author>
|
||||
<author>
|
||||
<firstname>Chris</firstname>
|
||||
<surname>Vance</surname>
|
||||
<affiliation>
|
||||
<orgname>NAI Labs</orgname>
|
||||
<address><email>cvance@nai.com</email></address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
</articleinfo>
|
||||
|
||||
<sect1 id="Introduction"><title>Introduction</title>
|
||||
|
||||
<para>
|
||||
In March 2001, the National Security Agency (NSA) gave a presentation
|
||||
about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel
|
||||
Summit. SELinux is an implementation of flexible and fine-grained
|
||||
nondiscretionary access controls in the Linux kernel, originally
|
||||
implemented as its own particular kernel patch. Several other
|
||||
security projects (e.g. RSBAC, Medusa) have also developed flexible
|
||||
access control architectures for the Linux kernel, and various
|
||||
projects have developed particular access control models for Linux
|
||||
(e.g. LIDS, DTE, SubDomain). Each project has developed and
|
||||
maintained its own kernel patch to support its security needs.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
In response to the NSA presentation, Linus Torvalds made a set of
|
||||
remarks that described a security framework he would be willing to
|
||||
consider for inclusion in the mainstream Linux kernel. He described a
|
||||
general framework that would provide a set of security hooks to
|
||||
control operations on kernel objects and a set of opaque security
|
||||
fields in kernel data structures for maintaining security attributes.
|
||||
This framework could then be used by loadable kernel modules to
|
||||
implement any desired model of security. Linus also suggested the
|
||||
possibility of migrating the Linux capabilities code into such a
|
||||
module.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The Linux Security Modules (LSM) project was started by WireX to
|
||||
develop such a framework. LSM is a joint development effort by
|
||||
several security projects, including Immunix, SELinux, SGI and Janus,
|
||||
and several individuals, including Greg Kroah-Hartman and James
|
||||
Morris, to develop a Linux kernel patch that implements this
|
||||
framework. The patch is currently tracking the 2.4 series and is
|
||||
targeted for integration into the 2.5 development series. This
|
||||
technical report provides an overview of the framework and the example
|
||||
capabilities security module provided by the LSM kernel patch.
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="framework"><title>LSM Framework</title>
|
||||
|
||||
<para>
|
||||
The LSM kernel patch provides a general kernel framework to support
|
||||
security modules. In particular, the LSM framework is primarily
|
||||
focused on supporting access control modules, although future
|
||||
development is likely to address other security needs such as
|
||||
auditing. By itself, the framework does not provide any additional
|
||||
security; it merely provides the infrastructure to support security
|
||||
modules. The LSM kernel patch also moves most of the capabilities
|
||||
logic into an optional security module, with the system defaulting
|
||||
to the traditional superuser logic. This capabilities module
|
||||
is discussed further in <xref linkend="cap"/>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The LSM kernel patch adds security fields to kernel data structures
|
||||
and inserts calls to hook functions at critical points in the kernel
|
||||
code to manage the security fields and to perform access control. It
|
||||
also adds functions for registering and unregistering security
|
||||
modules, and adds a general <function>security</function> system call
|
||||
to support new system calls for security-aware applications.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The LSM security fields are simply <type>void*</type> pointers. For
|
||||
process and program execution security information, security fields
|
||||
were added to <structname>struct task_struct</structname> and
|
||||
<structname>struct linux_binprm</structname>. For filesystem security
|
||||
information, a security field was added to
|
||||
<structname>struct super_block</structname>. For pipe, file, and socket
|
||||
security information, security fields were added to
|
||||
<structname>struct inode</structname> and
|
||||
<structname>struct file</structname>. For packet and network device security
|
||||
information, security fields were added to
|
||||
<structname>struct sk_buff</structname> and
|
||||
<structname>struct net_device</structname>. For System V IPC security
|
||||
information, security fields were added to
|
||||
<structname>struct kern_ipc_perm</structname> and
|
||||
<structname>struct msg_msg</structname>; additionally, the definitions
|
||||
for <structname>struct msg_msg</structname>, <structname>struct
|
||||
msg_queue</structname>, and <structname>struct
|
||||
shmid_kernel</structname> were moved to header files
|
||||
(<filename>include/linux/msg.h</filename> and
|
||||
<filename>include/linux/shm.h</filename> as appropriate) to allow
|
||||
the security modules to use these definitions.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Each LSM hook is a function pointer in a global table,
|
||||
security_ops. This table is a
|
||||
<structname>security_operations</structname> structure as defined by
|
||||
<filename>include/linux/security.h</filename>. Detailed documentation
|
||||
for each hook is included in this header file. At present, this
|
||||
structure consists of a collection of substructures that group related
|
||||
hooks based on the kernel object (e.g. task, inode, file, sk_buff,
|
||||
etc) as well as some top-level hook function pointers for system
|
||||
operations. This structure is likely to be flattened in the future
|
||||
for performance. The placement of the hook calls in the kernel code
|
||||
is described by the "called:" lines in the per-hook documentation in
|
||||
the header file. The hook calls can also be easily found in the
|
||||
kernel code by looking for the string "security_ops->".
|
||||
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Linus mentioned per-process security hooks in his original remarks as a
|
||||
possible alternative to global security hooks. However, if LSM were
|
||||
to start from the perspective of per-process hooks, then the base
|
||||
framework would have to deal with how to handle operations that
|
||||
involve multiple processes (e.g. kill), since each process might have
|
||||
its own hook for controlling the operation. This would require a
|
||||
general mechanism for composing hooks in the base framework.
|
||||
Additionally, LSM would still need global hooks for operations that
|
||||
have no process context (e.g. network input operations).
|
||||
Consequently, LSM provides global security hooks, but a security
|
||||
module is free to implement per-process hooks (where that makes sense)
|
||||
by storing a security_ops table in each process' security field and
|
||||
then invoking these per-process hooks from the global hooks.
|
||||
The problem of composition is thus deferred to the module.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The global security_ops table is initialized to a set of hook
|
||||
functions provided by a dummy security module that provides
|
||||
traditional superuser logic. A <function>register_security</function>
|
||||
function (in <filename>security/security.c</filename>) is provided to
|
||||
allow a security module to set security_ops to refer to its own hook
|
||||
functions, and an <function>unregister_security</function> function is
|
||||
provided to revert security_ops to the dummy module hooks. This
|
||||
mechanism is used to set the primary security module, which is
|
||||
responsible for making the final decision for each hook.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
LSM also provides a simple mechanism for stacking additional security
|
||||
modules with the primary security module. It defines
|
||||
<function>register_security</function> and
|
||||
<function>unregister_security</function> hooks in the
|
||||
<structname>security_operations</structname> structure and provides
|
||||
<function>mod_reg_security</function> and
|
||||
<function>mod_unreg_security</function> functions that invoke these
|
||||
hooks after performing some sanity checking. A security module can
|
||||
call these functions in order to stack with other modules. However,
|
||||
the actual details of how this stacking is handled are deferred to the
|
||||
module, which can implement these hooks in any way it wishes
|
||||
(including always returning an error if it does not wish to support
|
||||
stacking). In this manner, LSM again defers the problem of
|
||||
composition to the module.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Although the LSM hooks are organized into substructures based on
|
||||
kernel object, all of the hooks can be viewed as falling into two
|
||||
major categories: hooks that are used to manage the security fields
|
||||
and hooks that are used to perform access control. Examples of the
|
||||
first category of hooks include the
|
||||
<function>alloc_security</function> and
|
||||
<function>free_security</function> hooks defined for each kernel data
|
||||
structure that has a security field. These hooks are used to allocate
|
||||
and free security structures for kernel objects. The first category
|
||||
of hooks also includes hooks that set information in the security
|
||||
field after allocation, such as the <function>post_lookup</function>
|
||||
hook in <structname>struct inode_security_ops</structname>. This hook
|
||||
is used to set security information for inodes after successful lookup
|
||||
operations. An example of the second category of hooks is the
|
||||
<function>permission</function> hook in
|
||||
<structname>struct inode_security_ops</structname>. This hook checks
|
||||
permission when accessing an inode.
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="cap"><title>LSM Capabilities Module</title>
|
||||
|
||||
<para>
|
||||
The LSM kernel patch moves most of the existing POSIX.1e capabilities
|
||||
logic into an optional security module stored in the file
|
||||
<filename>security/capability.c</filename>. This change allows
|
||||
users who do not want to use capabilities to omit this code entirely
|
||||
from their kernel, instead using the dummy module for traditional
|
||||
superuser logic or any other module that they desire. This change
|
||||
also allows the developers of the capabilities logic to maintain and
|
||||
enhance their code more freely, without needing to integrate patches
|
||||
back into the base kernel.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
In addition to moving the capabilities logic, the LSM kernel patch
|
||||
could move the capability-related fields from the kernel data
|
||||
structures into the new security fields managed by the security
|
||||
modules. However, at present, the LSM kernel patch leaves the
|
||||
capability fields in the kernel data structures. In his original
|
||||
remarks, Linus suggested that this might be preferable so that other
|
||||
security modules can be easily stacked with the capabilities module
|
||||
without needing to chain multiple security structures on the security field.
|
||||
It also avoids imposing extra overhead on the capabilities module
|
||||
to manage the security fields. However, the LSM framework could
|
||||
certainly support such a move if it is determined to be desirable,
|
||||
with only a few additional changes described below.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
At present, the capabilities logic for computing process capabilities
|
||||
on <function>execve</function> and <function>set*uid</function>,
|
||||
checking capabilities for a particular process, saving and checking
|
||||
capabilities for netlink messages, and handling the
|
||||
<function>capget</function> and <function>capset</function> system
|
||||
calls have been moved into the capabilities module. There are still a
|
||||
few locations in the base kernel where capability-related fields are
|
||||
directly examined or modified, but the current version of the LSM
|
||||
patch does allow a security module to completely replace the
|
||||
assignment and testing of capabilities. These few locations would
|
||||
need to be changed if the capability-related fields were moved into
|
||||
the security field. The following is a list of known locations that
|
||||
still perform such direct examination or modification of
|
||||
capability-related fields:
|
||||
<itemizedlist>
|
||||
<listitem><para><filename>fs/open.c</filename>:<function>sys_access</function></para></listitem>
|
||||
<listitem><para><filename>fs/lockd/host.c</filename>:<function>nlm_bind_host</function></para></listitem>
|
||||
<listitem><para><filename>fs/nfsd/auth.c</filename>:<function>nfsd_setuser</function></para></listitem>
|
||||
<listitem><para><filename>fs/proc/array.c</filename>:<function>task_cap</function></para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
</article>
|
File diff suppressed because it is too large
Load Diff
@ -1,111 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<book id="LinuxNetworking">
|
||||
<bookinfo>
|
||||
<title>Linux Networking and Network Devices APIs</title>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This documentation is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2 of the License, or (at your option) any later
|
||||
version.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This program is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU General Public License for more details.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
You should have received a copy of the GNU General Public
|
||||
License along with this program; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
||||
MA 02111-1307 USA
|
||||
</para>
|
||||
|
||||
<para>
|
||||
For more details see the file COPYING in the source
|
||||
distribution of Linux.
|
||||
</para>
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter id="netcore">
|
||||
<title>Linux Networking</title>
|
||||
<sect1><title>Networking Base Types</title>
|
||||
!Iinclude/linux/net.h
|
||||
</sect1>
|
||||
<sect1><title>Socket Buffer Functions</title>
|
||||
!Iinclude/linux/skbuff.h
|
||||
!Iinclude/net/sock.h
|
||||
!Enet/socket.c
|
||||
!Enet/core/skbuff.c
|
||||
!Enet/core/sock.c
|
||||
!Enet/core/datagram.c
|
||||
!Enet/core/stream.c
|
||||
</sect1>
|
||||
<sect1><title>Socket Filter</title>
|
||||
!Enet/core/filter.c
|
||||
</sect1>
|
||||
<sect1><title>Generic Network Statistics</title>
|
||||
!Iinclude/uapi/linux/gen_stats.h
|
||||
!Enet/core/gen_stats.c
|
||||
!Enet/core/gen_estimator.c
|
||||
</sect1>
|
||||
<sect1><title>SUN RPC subsystem</title>
|
||||
<!-- The !D functionality is not perfect, garbage has to be protected by comments
|
||||
!Dnet/sunrpc/sunrpc_syms.c
|
||||
-->
|
||||
!Enet/sunrpc/xdr.c
|
||||
!Enet/sunrpc/svc_xprt.c
|
||||
!Enet/sunrpc/xprt.c
|
||||
!Enet/sunrpc/sched.c
|
||||
!Enet/sunrpc/socklib.c
|
||||
!Enet/sunrpc/stats.c
|
||||
!Enet/sunrpc/rpc_pipe.c
|
||||
!Enet/sunrpc/rpcb_clnt.c
|
||||
!Enet/sunrpc/clnt.c
|
||||
</sect1>
|
||||
<sect1><title>WiMAX</title>
|
||||
!Enet/wimax/op-msg.c
|
||||
!Enet/wimax/op-reset.c
|
||||
!Enet/wimax/op-rfkill.c
|
||||
!Enet/wimax/stack.c
|
||||
!Iinclude/net/wimax.h
|
||||
!Iinclude/uapi/linux/wimax.h
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="netdev">
|
||||
<title>Network device support</title>
|
||||
<sect1><title>Driver Support</title>
|
||||
!Enet/core/dev.c
|
||||
!Enet/ethernet/eth.c
|
||||
!Enet/sched/sch_generic.c
|
||||
!Iinclude/linux/etherdevice.h
|
||||
!Iinclude/linux/netdevice.h
|
||||
</sect1>
|
||||
<sect1><title>PHY Support</title>
|
||||
!Edrivers/net/phy/phy.c
|
||||
!Idrivers/net/phy/phy.c
|
||||
!Edrivers/net/phy/phy_device.c
|
||||
!Idrivers/net/phy/phy_device.c
|
||||
!Edrivers/net/phy/mdio_bus.c
|
||||
!Idrivers/net/phy/mdio_bus.c
|
||||
</sect1>
|
||||
<!-- FIXME: Removed for now since no structured comments in source
|
||||
<sect1><title>Wireless</title>
|
||||
X!Enet/core/wireless.c
|
||||
</sect1>
|
||||
-->
|
||||
</chapter>
|
||||
|
||||
</book>
|
@ -1,155 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [
|
||||
<!ENTITY rapidio SYSTEM "rapidio.xml">
|
||||
]>
|
||||
|
||||
<book id="RapidIO-Guide">
|
||||
<bookinfo>
|
||||
<title>RapidIO Subsystem Guide</title>
|
||||
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>Matt</firstname>
|
||||
<surname>Porter</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>mporter@kernel.crashing.org</email>
|
||||
<email>mporter@mvista.com</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
|
||||
<copyright>
|
||||
<year>2005</year>
|
||||
<holder>MontaVista Software, Inc.</holder>
|
||||
</copyright>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This documentation is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU General Public
|
||||
License version 2 as published by the Free Software Foundation.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This program is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU General Public License for more details.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
You should have received a copy of the GNU General Public
|
||||
License along with this program; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
||||
MA 02111-1307 USA
|
||||
</para>
|
||||
|
||||
<para>
|
||||
For more details see the file COPYING in the source
|
||||
distribution of Linux.
|
||||
</para>
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter id="intro">
|
||||
<title>Introduction</title>
|
||||
<para>
|
||||
RapidIO is a high speed switched fabric interconnect with
|
||||
features aimed at the embedded market. RapidIO provides
|
||||
support for memory-mapped I/O as well as message-based
|
||||
transactions over the switched fabric network. RapidIO has
|
||||
a standardized discovery mechanism not unlike the PCI bus
|
||||
standard that allows simple detection of devices in a
|
||||
network.
|
||||
</para>
|
||||
<para>
|
||||
This documentation is provided for developers intending
|
||||
to support RapidIO on new architectures, write new drivers,
|
||||
or to understand the subsystem internals.
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="bugs">
|
||||
<title>Known Bugs and Limitations</title>
|
||||
|
||||
<sect1 id="known_bugs">
|
||||
<title>Bugs</title>
|
||||
<para>None. ;)</para>
|
||||
</sect1>
|
||||
<sect1 id="Limitations">
|
||||
<title>Limitations</title>
|
||||
<para>
|
||||
<orderedlist>
|
||||
<listitem><para>Access/management of RapidIO memory regions is not supported</para></listitem>
|
||||
<listitem><para>Multiple host enumeration is not supported</para></listitem>
|
||||
</orderedlist>
|
||||
</para>
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="drivers">
|
||||
<title>RapidIO driver interface</title>
|
||||
<para>
|
||||
Drivers are provided a set of calls in order
|
||||
to interface with the subsystem to gather info
|
||||
on devices, request/map memory region resources,
|
||||
and manage mailboxes/doorbells.
|
||||
</para>
|
||||
<sect1 id="Functions">
|
||||
<title>Functions</title>
|
||||
!Iinclude/linux/rio_drv.h
|
||||
!Edrivers/rapidio/rio-driver.c
|
||||
!Edrivers/rapidio/rio.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="internals">
|
||||
<title>Internals</title>
|
||||
|
||||
<para>
|
||||
This chapter contains the autogenerated documentation of the RapidIO
|
||||
subsystem.
|
||||
</para>
|
||||
|
||||
<sect1 id="Structures"><title>Structures</title>
|
||||
!Iinclude/linux/rio.h
|
||||
</sect1>
|
||||
<sect1 id="Enumeration_and_Discovery"><title>Enumeration and Discovery</title>
|
||||
!Idrivers/rapidio/rio-scan.c
|
||||
</sect1>
|
||||
<sect1 id="Driver_functionality"><title>Driver functionality</title>
|
||||
!Idrivers/rapidio/rio.c
|
||||
!Idrivers/rapidio/rio-access.c
|
||||
</sect1>
|
||||
<sect1 id="Device_model_support"><title>Device model support</title>
|
||||
!Idrivers/rapidio/rio-driver.c
|
||||
</sect1>
|
||||
<sect1 id="PPC32_support"><title>PPC32 support</title>
|
||||
!Iarch/powerpc/sysdev/fsl_rio.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="credits">
|
||||
<title>Credits</title>
|
||||
<para>
|
||||
The following people have contributed to the RapidIO
|
||||
subsystem directly or indirectly:
|
||||
<orderedlist>
|
||||
<listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem>
|
||||
<listitem><para>Randy Vinson<email>rvinson@mvista.com</email></para></listitem>
|
||||
<listitem><para>Dan Malek<email>dan@embeddedalley.com</email></para></listitem>
|
||||
</orderedlist>
|
||||
</para>
|
||||
<para>
|
||||
The following people have contributed to this document:
|
||||
<orderedlist>
|
||||
<listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem>
|
||||
</orderedlist>
|
||||
</para>
|
||||
</chapter>
|
||||
</book>
|
@ -1,161 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<book id="s390drivers">
|
||||
<bookinfo>
|
||||
<title>Writing s390 channel device drivers</title>
|
||||
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>Cornelia</firstname>
|
||||
<surname>Huck</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>cornelia.huck@de.ibm.com</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
|
||||
<copyright>
|
||||
<year>2007</year>
|
||||
<holder>IBM Corp.</holder>
|
||||
</copyright>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This documentation is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2 of the License, or (at your option) any later
|
||||
version.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This program is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU General Public License for more details.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
You should have received a copy of the GNU General Public
|
||||
License along with this program; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
||||
MA 02111-1307 USA
|
||||
</para>
|
||||
|
||||
<para>
|
||||
For more details see the file COPYING in the source
|
||||
distribution of Linux.
|
||||
</para>
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter id="intro">
|
||||
<title>Introduction</title>
|
||||
<para>
|
||||
This document describes the interfaces available for device drivers that
|
||||
drive s390 based channel attached I/O devices. This includes interfaces for
|
||||
interaction with the hardware and interfaces for interacting with the
|
||||
common driver core. Those interfaces are provided by the s390 common I/O
|
||||
layer.
|
||||
</para>
|
||||
<para>
|
||||
The document assumes a familarity with the technical terms associated
|
||||
with the s390 channel I/O architecture. For a description of this
|
||||
architecture, please refer to the "z/Architecture: Principles of
|
||||
Operation", IBM publication no. SA22-7832.
|
||||
</para>
|
||||
<para>
|
||||
While most I/O devices on a s390 system are typically driven through the
|
||||
channel I/O mechanism described here, there are various other methods
|
||||
(like the diag interface). These are out of the scope of this document.
|
||||
</para>
|
||||
<para>
|
||||
Some additional information can also be found in the kernel source
|
||||
under Documentation/s390/driver-model.txt.
|
||||
</para>
|
||||
</chapter>
|
||||
<chapter id="ccw">
|
||||
<title>The ccw bus</title>
|
||||
<para>
|
||||
The ccw bus typically contains the majority of devices available to
|
||||
a s390 system. Named after the channel command word (ccw), the basic
|
||||
command structure used to address its devices, the ccw bus contains
|
||||
so-called channel attached devices. They are addressed via I/O
|
||||
subchannels, visible on the css bus. A device driver for
|
||||
channel-attached devices, however, will never interact with the
|
||||
subchannel directly, but only via the I/O device on the ccw bus,
|
||||
the ccw device.
|
||||
</para>
|
||||
<sect1 id="channelIO">
|
||||
<title>I/O functions for channel-attached devices</title>
|
||||
<para>
|
||||
Some hardware structures have been translated into C structures for use
|
||||
by the common I/O layer and device drivers. For more information on
|
||||
the hardware structures represented here, please consult the Principles
|
||||
of Operation.
|
||||
</para>
|
||||
!Iarch/s390/include/asm/cio.h
|
||||
</sect1>
|
||||
<sect1 id="ccwdev">
|
||||
<title>ccw devices</title>
|
||||
<para>
|
||||
Devices that want to initiate channel I/O need to attach to the ccw bus.
|
||||
Interaction with the driver core is done via the common I/O layer, which
|
||||
provides the abstractions of ccw devices and ccw device drivers.
|
||||
</para>
|
||||
<para>
|
||||
The functions that initiate or terminate channel I/O all act upon a
|
||||
ccw device structure. Device drivers must not bypass those functions
|
||||
or strange side effects may happen.
|
||||
</para>
|
||||
!Iarch/s390/include/asm/ccwdev.h
|
||||
!Edrivers/s390/cio/device.c
|
||||
!Edrivers/s390/cio/device_ops.c
|
||||
</sect1>
|
||||
<sect1 id="cmf">
|
||||
<title>The channel-measurement facility</title>
|
||||
<para>
|
||||
The channel-measurement facility provides a means to collect
|
||||
measurement data which is made available by the channel subsystem
|
||||
for each channel attached device.
|
||||
</para>
|
||||
!Iarch/s390/include/asm/cmb.h
|
||||
!Edrivers/s390/cio/cmf.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="ccwgroup">
|
||||
<title>The ccwgroup bus</title>
|
||||
<para>
|
||||
The ccwgroup bus only contains artificial devices, created by the user.
|
||||
Many networking devices (e.g. qeth) are in fact composed of several
|
||||
ccw devices (like read, write and data channel for qeth). The
|
||||
ccwgroup bus provides a mechanism to create a meta-device which
|
||||
contains those ccw devices as slave devices and can be associated
|
||||
with the netdevice.
|
||||
</para>
|
||||
<sect1 id="ccwgroupdevices">
|
||||
<title>ccw group devices</title>
|
||||
!Iarch/s390/include/asm/ccwgroup.h
|
||||
!Edrivers/s390/cio/ccwgroup.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="genericinterfaces">
|
||||
<title>Generic interfaces</title>
|
||||
<para>
|
||||
Some interfaces are available to other drivers that do not necessarily
|
||||
have anything to do with the busses described above, but still are
|
||||
indirectly using basic infrastructure in the common I/O layer.
|
||||
One example is the support for adapter interrupts.
|
||||
</para>
|
||||
!Edrivers/s390/cio/airq.c
|
||||
</chapter>
|
||||
|
||||
</book>
|
@ -1,409 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<book id="scsimid">
|
||||
<bookinfo>
|
||||
<title>SCSI Interfaces Guide</title>
|
||||
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>James</firstname>
|
||||
<surname>Bottomley</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>James.Bottomley@hansenpartnership.com</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
|
||||
<author>
|
||||
<firstname>Rob</firstname>
|
||||
<surname>Landley</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>rob@landley.net</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
|
||||
</authorgroup>
|
||||
|
||||
<copyright>
|
||||
<year>2007</year>
|
||||
<holder>Linux Foundation</holder>
|
||||
</copyright>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This documentation is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU General Public
|
||||
License version 2.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This program is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
For more details see the file COPYING in the source
|
||||
distribution of Linux.
|
||||
</para>
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter id="intro">
|
||||
<title>Introduction</title>
|
||||
<sect1 id="protocol_vs_bus">
|
||||
<title>Protocol vs bus</title>
|
||||
<para>
|
||||
Once upon a time, the Small Computer Systems Interface defined both
|
||||
a parallel I/O bus and a data protocol to connect a wide variety of
|
||||
peripherals (disk drives, tape drives, modems, printers, scanners,
|
||||
optical drives, test equipment, and medical devices) to a host
|
||||
computer.
|
||||
</para>
|
||||
<para>
|
||||
Although the old parallel (fast/wide/ultra) SCSI bus has largely
|
||||
fallen out of use, the SCSI command set is more widely used than ever
|
||||
to communicate with devices over a number of different busses.
|
||||
</para>
|
||||
<para>
|
||||
The <ulink url='http://www.t10.org/scsi-3.htm'>SCSI protocol</ulink>
|
||||
is a big-endian peer-to-peer packet based protocol. SCSI commands
|
||||
are 6, 10, 12, or 16 bytes long, often followed by an associated data
|
||||
payload.
|
||||
</para>
|
||||
<para>
|
||||
SCSI commands can be transported over just about any kind of bus, and
|
||||
are the default protocol for storage devices attached to USB, SATA,
|
||||
SAS, Fibre Channel, FireWire, and ATAPI devices. SCSI packets are
|
||||
also commonly exchanged over Infiniband,
|
||||
<ulink url='http://i2o.shadowconnect.com/faq.php'>I20</ulink>, TCP/IP
|
||||
(<ulink url='https://en.wikipedia.org/wiki/ISCSI'>iSCSI</ulink>), even
|
||||
<ulink url='http://cyberelk.net/tim/parport/parscsi.html'>Parallel
|
||||
ports</ulink>.
|
||||
</para>
|
||||
</sect1>
|
||||
<sect1 id="subsystem_design">
|
||||
<title>Design of the Linux SCSI subsystem</title>
|
||||
<para>
|
||||
The SCSI subsystem uses a three layer design, with upper, mid, and low
|
||||
layers. Every operation involving the SCSI subsystem (such as reading
|
||||
a sector from a disk) uses one driver at each of the 3 levels: one
|
||||
upper layer driver, one lower layer driver, and the SCSI midlayer.
|
||||
</para>
|
||||
<para>
|
||||
The SCSI upper layer provides the interface between userspace and the
|
||||
kernel, in the form of block and char device nodes for I/O and
|
||||
ioctl(). The SCSI lower layer contains drivers for specific hardware
|
||||
devices.
|
||||
</para>
|
||||
<para>
|
||||
In between is the SCSI mid-layer, analogous to a network routing
|
||||
layer such as the IPv4 stack. The SCSI mid-layer routes a packet
|
||||
based data protocol between the upper layer's /dev nodes and the
|
||||
corresponding devices in the lower layer. It manages command queues,
|
||||
provides error handling and power management functions, and responds
|
||||
to ioctl() requests.
|
||||
</para>
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="upper_layer">
|
||||
<title>SCSI upper layer</title>
|
||||
<para>
|
||||
The upper layer supports the user-kernel interface by providing
|
||||
device nodes.
|
||||
</para>
|
||||
<sect1 id="sd">
|
||||
<title>sd (SCSI Disk)</title>
|
||||
<para>sd (sd_mod.o)</para>
|
||||
<!-- !Idrivers/scsi/sd.c -->
|
||||
</sect1>
|
||||
<sect1 id="sr">
|
||||
<title>sr (SCSI CD-ROM)</title>
|
||||
<para>sr (sr_mod.o)</para>
|
||||
</sect1>
|
||||
<sect1 id="st">
|
||||
<title>st (SCSI Tape)</title>
|
||||
<para>st (st.o)</para>
|
||||
</sect1>
|
||||
<sect1 id="sg">
|
||||
<title>sg (SCSI Generic)</title>
|
||||
<para>sg (sg.o)</para>
|
||||
</sect1>
|
||||
<sect1 id="ch">
|
||||
<title>ch (SCSI Media Changer)</title>
|
||||
<para>ch (ch.c)</para>
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
<chapter id="mid_layer">
|
||||
<title>SCSI mid layer</title>
|
||||
|
||||
<sect1 id="midlayer_implementation">
|
||||
<title>SCSI midlayer implementation</title>
|
||||
<sect2 id="scsi_device.h">
|
||||
<title>include/scsi/scsi_device.h</title>
|
||||
<para>
|
||||
</para>
|
||||
!Iinclude/scsi/scsi_device.h
|
||||
</sect2>
|
||||
|
||||
<sect2 id="scsi.c">
|
||||
<title>drivers/scsi/scsi.c</title>
|
||||
<para>Main file for the SCSI midlayer.</para>
|
||||
!Edrivers/scsi/scsi.c
|
||||
</sect2>
|
||||
<sect2 id="scsicam.c">
|
||||
<title>drivers/scsi/scsicam.c</title>
|
||||
<para>
|
||||
<ulink url='http://www.t10.org/ftp/t10/drafts/cam/cam-r12b.pdf'>SCSI
|
||||
Common Access Method</ulink> support functions, for use with
|
||||
HDIO_GETGEO, etc.
|
||||
</para>
|
||||
!Edrivers/scsi/scsicam.c
|
||||
</sect2>
|
||||
<sect2 id="scsi_error.c">
|
||||
<title>drivers/scsi/scsi_error.c</title>
|
||||
<para>Common SCSI error/timeout handling routines.</para>
|
||||
!Edrivers/scsi/scsi_error.c
|
||||
</sect2>
|
||||
<sect2 id="scsi_devinfo.c">
|
||||
<title>drivers/scsi/scsi_devinfo.c</title>
|
||||
<para>
|
||||
Manage scsi_dev_info_list, which tracks blacklisted and whitelisted
|
||||
devices.
|
||||
</para>
|
||||
!Idrivers/scsi/scsi_devinfo.c
|
||||
</sect2>
|
||||
<sect2 id="scsi_ioctl.c">
|
||||
<title>drivers/scsi/scsi_ioctl.c</title>
|
||||
<para>
|
||||
Handle ioctl() calls for SCSI devices.
|
||||
</para>
|
||||
!Edrivers/scsi/scsi_ioctl.c
|
||||
</sect2>
|
||||
<sect2 id="scsi_lib.c">
|
||||
<title>drivers/scsi/scsi_lib.c</title>
|
||||
<para>
|
||||
SCSI queuing library.
|
||||
</para>
|
||||
!Edrivers/scsi/scsi_lib.c
|
||||
</sect2>
|
||||
<sect2 id="scsi_lib_dma.c">
|
||||
<title>drivers/scsi/scsi_lib_dma.c</title>
|
||||
<para>
|
||||
SCSI library functions depending on DMA
|
||||
(map and unmap scatter-gather lists).
|
||||
</para>
|
||||
!Edrivers/scsi/scsi_lib_dma.c
|
||||
</sect2>
|
||||
<sect2 id="scsi_module.c">
|
||||
<title>drivers/scsi/scsi_module.c</title>
|
||||
<para>
|
||||
The file drivers/scsi/scsi_module.c contains legacy support for
|
||||
old-style host templates. It should never be used by any new driver.
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2 id="scsi_proc.c">
|
||||
<title>drivers/scsi/scsi_proc.c</title>
|
||||
<para>
|
||||
The functions in this file provide an interface between
|
||||
the PROC file system and the SCSI device drivers
|
||||
It is mainly used for debugging, statistics and to pass
|
||||
information directly to the lowlevel driver.
|
||||
|
||||
I.E. plumbing to manage /proc/scsi/*
|
||||
</para>
|
||||
!Idrivers/scsi/scsi_proc.c
|
||||
</sect2>
|
||||
<sect2 id="scsi_netlink.c">
|
||||
<title>drivers/scsi/scsi_netlink.c</title>
|
||||
<para>
|
||||
Infrastructure to provide async events from transports to userspace
|
||||
via netlink, using a single NETLINK_SCSITRANSPORT protocol for all
|
||||
transports.
|
||||
|
||||
See <ulink url='http://marc.info/?l=linux-scsi&m=115507374832500&w=2'>the
|
||||
original patch submission</ulink> for more details.
|
||||
</para>
|
||||
!Idrivers/scsi/scsi_netlink.c
|
||||
</sect2>
|
||||
<sect2 id="scsi_scan.c">
|
||||
<title>drivers/scsi/scsi_scan.c</title>
|
||||
<para>
|
||||
Scan a host to determine which (if any) devices are attached.
|
||||
|
||||
The general scanning/probing algorithm is as follows, exceptions are
|
||||
made to it depending on device specific flags, compilation options,
|
||||
and global variable (boot or module load time) settings.
|
||||
|
||||
A specific LUN is scanned via an INQUIRY command; if the LUN has a
|
||||
device attached, a scsi_device is allocated and setup for it.
|
||||
|
||||
For every id of every channel on the given host, start by scanning
|
||||
LUN 0. Skip hosts that don't respond at all to a scan of LUN 0.
|
||||
Otherwise, if LUN 0 has a device attached, allocate and setup a
|
||||
scsi_device for it. If target is SCSI-3 or up, issue a REPORT LUN,
|
||||
and scan all of the LUNs returned by the REPORT LUN; else,
|
||||
sequentially scan LUNs up until some maximum is reached, or a LUN is
|
||||
seen that cannot have a device attached to it.
|
||||
</para>
|
||||
!Idrivers/scsi/scsi_scan.c
|
||||
</sect2>
|
||||
<sect2 id="scsi_sysctl.c">
|
||||
<title>drivers/scsi/scsi_sysctl.c</title>
|
||||
<para>
|
||||
Set up the sysctl entry: "/dev/scsi/logging_level"
|
||||
(DEV_SCSI_LOGGING_LEVEL) which sets/returns scsi_logging_level.
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2 id="scsi_sysfs.c">
|
||||
<title>drivers/scsi/scsi_sysfs.c</title>
|
||||
<para>
|
||||
SCSI sysfs interface routines.
|
||||
</para>
|
||||
!Edrivers/scsi/scsi_sysfs.c
|
||||
</sect2>
|
||||
<sect2 id="hosts.c">
|
||||
<title>drivers/scsi/hosts.c</title>
|
||||
<para>
|
||||
mid to lowlevel SCSI driver interface
|
||||
</para>
|
||||
!Edrivers/scsi/hosts.c
|
||||
</sect2>
|
||||
<sect2 id="constants.c">
|
||||
<title>drivers/scsi/constants.c</title>
|
||||
<para>
|
||||
mid to lowlevel SCSI driver interface
|
||||
</para>
|
||||
!Edrivers/scsi/constants.c
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="Transport_classes">
|
||||
<title>Transport classes</title>
|
||||
<para>
|
||||
Transport classes are service libraries for drivers in the SCSI
|
||||
lower layer, which expose transport attributes in sysfs.
|
||||
</para>
|
||||
<sect2 id="Fibre_Channel_transport">
|
||||
<title>Fibre Channel transport</title>
|
||||
<para>
|
||||
The file drivers/scsi/scsi_transport_fc.c defines transport attributes
|
||||
for Fibre Channel.
|
||||
</para>
|
||||
!Edrivers/scsi/scsi_transport_fc.c
|
||||
</sect2>
|
||||
<sect2 id="iSCSI_transport">
|
||||
<title>iSCSI transport class</title>
|
||||
<para>
|
||||
The file drivers/scsi/scsi_transport_iscsi.c defines transport
|
||||
attributes for the iSCSI class, which sends SCSI packets over TCP/IP
|
||||
connections.
|
||||
</para>
|
||||
!Edrivers/scsi/scsi_transport_iscsi.c
|
||||
</sect2>
|
||||
<sect2 id="SAS_transport">
|
||||
<title>Serial Attached SCSI (SAS) transport class</title>
|
||||
<para>
|
||||
The file drivers/scsi/scsi_transport_sas.c defines transport
|
||||
attributes for Serial Attached SCSI, a variant of SATA aimed at
|
||||
large high-end systems.
|
||||
</para>
|
||||
<para>
|
||||
The SAS transport class contains common code to deal with SAS HBAs,
|
||||
an aproximated representation of SAS topologies in the driver model,
|
||||
and various sysfs attributes to expose these topologies and management
|
||||
interfaces to userspace.
|
||||
</para>
|
||||
<para>
|
||||
In addition to the basic SCSI core objects this transport class
|
||||
introduces two additional intermediate objects: The SAS PHY
|
||||
as represented by struct sas_phy defines an "outgoing" PHY on
|
||||
a SAS HBA or Expander, and the SAS remote PHY represented by
|
||||
struct sas_rphy defines an "incoming" PHY on a SAS Expander or
|
||||
end device. Note that this is purely a software concept, the
|
||||
underlying hardware for a PHY and a remote PHY is the exactly
|
||||
the same.
|
||||
</para>
|
||||
<para>
|
||||
There is no concept of a SAS port in this code, users can see
|
||||
what PHYs form a wide port based on the port_identifier attribute,
|
||||
which is the same for all PHYs in a port.
|
||||
</para>
|
||||
!Edrivers/scsi/scsi_transport_sas.c
|
||||
</sect2>
|
||||
<sect2 id="SATA_transport">
|
||||
<title>SATA transport class</title>
|
||||
<para>
|
||||
The SATA transport is handled by libata, which has its own book of
|
||||
documentation in this directory.
|
||||
</para>
|
||||
</sect2>
|
||||
<sect2 id="SPI_transport">
|
||||
<title>Parallel SCSI (SPI) transport class</title>
|
||||
<para>
|
||||
The file drivers/scsi/scsi_transport_spi.c defines transport
|
||||
attributes for traditional (fast/wide/ultra) SCSI busses.
|
||||
</para>
|
||||
!Edrivers/scsi/scsi_transport_spi.c
|
||||
</sect2>
|
||||
<sect2 id="SRP_transport">
|
||||
<title>SCSI RDMA (SRP) transport class</title>
|
||||
<para>
|
||||
The file drivers/scsi/scsi_transport_srp.c defines transport
|
||||
attributes for SCSI over Remote Direct Memory Access.
|
||||
</para>
|
||||
!Edrivers/scsi/scsi_transport_srp.c
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
</chapter>
|
||||
|
||||
<chapter id="lower_layer">
|
||||
<title>SCSI lower layer</title>
|
||||
<sect1 id="hba_drivers">
|
||||
<title>Host Bus Adapter transport types</title>
|
||||
<para>
|
||||
Many modern device controllers use the SCSI command set as a protocol to
|
||||
communicate with their devices through many different types of physical
|
||||
connections.
|
||||
</para>
|
||||
<para>
|
||||
In SCSI language a bus capable of carrying SCSI commands is
|
||||
called a "transport", and a controller connecting to such a bus is
|
||||
called a "host bus adapter" (HBA).
|
||||
</para>
|
||||
<sect2 id="scsi_debug.c">
|
||||
<title>Debug transport</title>
|
||||
<para>
|
||||
The file drivers/scsi/scsi_debug.c simulates a host adapter with a
|
||||
variable number of disks (or disk like devices) attached, sharing a
|
||||
common amount of RAM. Does a lot of checking to make sure that we are
|
||||
not getting blocks mixed up, and panics the kernel if anything out of
|
||||
the ordinary is seen.
|
||||
</para>
|
||||
<para>
|
||||
To be more realistic, the simulated devices have the transport
|
||||
attributes of SAS disks.
|
||||
</para>
|
||||
<para>
|
||||
For documentation see
|
||||
<ulink url='http://sg.danny.cz/sg/sdebug26.html'>http://sg.danny.cz/sg/sdebug26.html</ulink>
|
||||
</para>
|
||||
<!-- !Edrivers/scsi/scsi_debug.c -->
|
||||
</sect2>
|
||||
<sect2 id="todo">
|
||||
<title>todo</title>
|
||||
<para>Parallel (fast/wide/ultra) SCSI, USB, SATA,
|
||||
SAS, Fibre Channel, FireWire, ATAPI devices, Infiniband,
|
||||
I20, iSCSI, Parallel ports, netlink...
|
||||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
</chapter>
|
||||
</book>
|
@ -1,105 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<book id="sh-drivers">
|
||||
<bookinfo>
|
||||
<title>SuperH Interfaces Guide</title>
|
||||
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>Paul</firstname>
|
||||
<surname>Mundt</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>lethal@linux-sh.org</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
|
||||
<copyright>
|
||||
<year>2008-2010</year>
|
||||
<holder>Paul Mundt</holder>
|
||||
</copyright>
|
||||
<copyright>
|
||||
<year>2008-2010</year>
|
||||
<holder>Renesas Technology Corp.</holder>
|
||||
</copyright>
|
||||
<copyright>
|
||||
<year>2010</year>
|
||||
<holder>Renesas Electronics Corp.</holder>
|
||||
</copyright>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This documentation is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU General Public
|
||||
License version 2 as published by the Free Software Foundation.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This program is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU General Public License for more details.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
You should have received a copy of the GNU General Public
|
||||
License along with this program; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
||||
MA 02111-1307 USA
|
||||
</para>
|
||||
|
||||
<para>
|
||||
For more details see the file COPYING in the source
|
||||
distribution of Linux.
|
||||
</para>
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter id="mm">
|
||||
<title>Memory Management</title>
|
||||
<sect1 id="sh4">
|
||||
<title>SH-4</title>
|
||||
<sect2 id="sq">
|
||||
<title>Store Queue API</title>
|
||||
!Earch/sh/kernel/cpu/sh4/sq.c
|
||||
</sect2>
|
||||
</sect1>
|
||||
<sect1 id="sh5">
|
||||
<title>SH-5</title>
|
||||
<sect2 id="tlb">
|
||||
<title>TLB Interfaces</title>
|
||||
!Iarch/sh/mm/tlb-sh5.c
|
||||
!Iarch/sh/include/asm/tlb_64.h
|
||||
</sect2>
|
||||
</sect1>
|
||||
</chapter>
|
||||
<chapter id="mach">
|
||||
<title>Machine Specific Interfaces</title>
|
||||
<sect1 id="dreamcast">
|
||||
<title>mach-dreamcast</title>
|
||||
!Iarch/sh/boards/mach-dreamcast/rtc.c
|
||||
</sect1>
|
||||
<sect1 id="x3proto">
|
||||
<title>mach-x3proto</title>
|
||||
!Earch/sh/boards/mach-x3proto/ilsel.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
<chapter id="busses">
|
||||
<title>Busses</title>
|
||||
<sect1 id="superhyway">
|
||||
<title>SuperHyway</title>
|
||||
!Edrivers/sh/superhyway/superhyway.c
|
||||
</sect1>
|
||||
|
||||
<sect1 id="maple">
|
||||
<title>Maple</title>
|
||||
!Edrivers/sh/maple/maple.c
|
||||
</sect1>
|
||||
</chapter>
|
||||
</book>
|
@ -1,11 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<stylesheet xmlns="http://www.w3.org/1999/XSL/Transform" version="1.0">
|
||||
<param name="chunk.quietly">1</param>
|
||||
<param name="funcsynopsis.style">ansi</param>
|
||||
<param name="funcsynopsis.tabular.threshold">80</param>
|
||||
<param name="callout.graphics">0</param>
|
||||
<!-- <param name="paper.type">A4</param> -->
|
||||
<param name="generate.consistent.ids">1</param>
|
||||
<param name="generate.section.toc.level">2</param>
|
||||
<param name="use.id.as.filename">1</param>
|
||||
</stylesheet>
|
@ -1,101 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<book id="w1id">
|
||||
<bookinfo>
|
||||
<title>W1: Dallas' 1-wire bus</title>
|
||||
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>David</firstname>
|
||||
<surname>Fries</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>David@Fries.net</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
|
||||
</authorgroup>
|
||||
|
||||
<copyright>
|
||||
<year>2013</year>
|
||||
<!--
|
||||
<holder></holder>
|
||||
-->
|
||||
</copyright>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This documentation is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU General Public
|
||||
License version 2.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This program is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
For more details see the file COPYING in the source
|
||||
distribution of Linux.
|
||||
</para>
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter id="w1_internal">
|
||||
<title>W1 API internal to the kernel</title>
|
||||
|
||||
<sect1 id="w1_internal_api">
|
||||
<title>W1 API internal to the kernel</title>
|
||||
<sect2 id="w1.h">
|
||||
<title>drivers/w1/w1.h</title>
|
||||
<para>W1 core functions.</para>
|
||||
!Idrivers/w1/w1.h
|
||||
</sect2>
|
||||
|
||||
<sect2 id="w1.c">
|
||||
<title>drivers/w1/w1.c</title>
|
||||
<para>W1 core functions.</para>
|
||||
!Idrivers/w1/w1.c
|
||||
</sect2>
|
||||
|
||||
<sect2 id="w1_family.h">
|
||||
<title>drivers/w1/w1_family.h</title>
|
||||
<para>Allows registering device family operations.</para>
|
||||
!Idrivers/w1/w1_family.h
|
||||
</sect2>
|
||||
|
||||
<sect2 id="w1_family.c">
|
||||
<title>drivers/w1/w1_family.c</title>
|
||||
<para>Allows registering device family operations.</para>
|
||||
!Edrivers/w1/w1_family.c
|
||||
</sect2>
|
||||
|
||||
<sect2 id="w1_int.c">
|
||||
<title>drivers/w1/w1_int.c</title>
|
||||
<para>W1 internal initialization for master devices.</para>
|
||||
!Edrivers/w1/w1_int.c
|
||||
</sect2>
|
||||
|
||||
<sect2 id="w1_netlink.h">
|
||||
<title>drivers/w1/w1_netlink.h</title>
|
||||
<para>W1 external netlink API structures and commands.</para>
|
||||
!Idrivers/w1/w1_netlink.h
|
||||
</sect2>
|
||||
|
||||
<sect2 id="w1_io.c">
|
||||
<title>drivers/w1/w1_io.c</title>
|
||||
<para>W1 input/output.</para>
|
||||
!Edrivers/w1/w1_io.c
|
||||
!Idrivers/w1/w1_io.c
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
</chapter>
|
||||
|
||||
</book>
|
@ -1,371 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
||||
|
||||
<book id="Z85230Guide">
|
||||
<bookinfo>
|
||||
<title>Z8530 Programming Guide</title>
|
||||
|
||||
<authorgroup>
|
||||
<author>
|
||||
<firstname>Alan</firstname>
|
||||
<surname>Cox</surname>
|
||||
<affiliation>
|
||||
<address>
|
||||
<email>alan@lxorguk.ukuu.org.uk</email>
|
||||
</address>
|
||||
</affiliation>
|
||||
</author>
|
||||
</authorgroup>
|
||||
|
||||
<copyright>
|
||||
<year>2000</year>
|
||||
<holder>Alan Cox</holder>
|
||||
</copyright>
|
||||
|
||||
<legalnotice>
|
||||
<para>
|
||||
This documentation is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2 of the License, or (at your option) any later
|
||||
version.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This program is distributed in the hope that it will be
|
||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU General Public License for more details.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
You should have received a copy of the GNU General Public
|
||||
License along with this program; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
||||
MA 02111-1307 USA
|
||||
</para>
|
||||
|
||||
<para>
|
||||
For more details see the file COPYING in the source
|
||||
distribution of Linux.
|
||||
</para>
|
||||
</legalnotice>
|
||||
</bookinfo>
|
||||
|
||||
<toc></toc>
|
||||
|
||||
<chapter id="intro">
|
||||
<title>Introduction</title>
|
||||
<para>
|
||||
The Z85x30 family synchronous/asynchronous controller chips are
|
||||
used on a large number of cheap network interface cards. The
|
||||
kernel provides a core interface layer that is designed to make
|
||||
it easy to provide WAN services using this chip.
|
||||
</para>
|
||||
<para>
|
||||
The current driver only support synchronous operation. Merging the
|
||||
asynchronous driver support into this code to allow any Z85x30
|
||||
device to be used as both a tty interface and as a synchronous
|
||||
controller is a project for Linux post the 2.4 release
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="Driver_Modes">
|
||||
<title>Driver Modes</title>
|
||||
<para>
|
||||
The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices
|
||||
in three different modes. Each mode can be applied to an individual
|
||||
channel on the chip (each chip has two channels).
|
||||
</para>
|
||||
<para>
|
||||
The PIO synchronous mode supports the most common Z8530 wiring. Here
|
||||
the chip is interface to the I/O and interrupt facilities of the
|
||||
host machine but not to the DMA subsystem. When running PIO the
|
||||
Z8530 has extremely tight timing requirements. Doing high speeds,
|
||||
even with a Z85230 will be tricky. Typically you should expect to
|
||||
achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230.
|
||||
</para>
|
||||
<para>
|
||||
The DMA mode supports the chip when it is configured to use dual DMA
|
||||
channels on an ISA bus. The better cards tend to support this mode
|
||||
of operation for a single channel. With DMA running the Z85230 tops
|
||||
out when it starts to hit ISA DMA constraints at about 512Kbits. It
|
||||
is worth noting here that many PC machines hang or crash when the
|
||||
chip is driven fast enough to hold the ISA bus solid.
|
||||
</para>
|
||||
<para>
|
||||
Transmit DMA mode uses a single DMA channel. The DMA channel is used
|
||||
for transmission as the transmit FIFO is smaller than the receive
|
||||
FIFO. it gives better performance than pure PIO mode but is nowhere
|
||||
near as ideal as pure DMA mode.
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="Using_the_Z85230_driver">
|
||||
<title>Using the Z85230 driver</title>
|
||||
<para>
|
||||
The Z85230 driver provides the back end interface to your board. To
|
||||
configure a Z8530 interface you need to detect the board and to
|
||||
identify its ports and interrupt resources. It is also your problem
|
||||
to verify the resources are available.
|
||||
</para>
|
||||
<para>
|
||||
Having identified the chip you need to fill in a struct z8530_dev,
|
||||
which describes each chip. This object must exist until you finally
|
||||
shutdown the board. Firstly zero the active field. This ensures
|
||||
nothing goes off without you intending it. The irq field should
|
||||
be set to the interrupt number of the chip. (Each chip has a single
|
||||
interrupt source rather than each channel). You are responsible
|
||||
for allocating the interrupt line. The interrupt handler should be
|
||||
set to <function>z8530_interrupt</function>. The device id should
|
||||
be set to the z8530_dev structure pointer. Whether the interrupt can
|
||||
be shared or not is board dependent, and up to you to initialise.
|
||||
</para>
|
||||
<para>
|
||||
The structure holds two channel structures.
|
||||
Initialise chanA.ctrlio and chanA.dataio with the address of the
|
||||
control and data ports. You can or this with Z8530_PORT_SLEEP to
|
||||
indicate your interface needs the 5uS delay for chip settling done
|
||||
in software. The PORT_SLEEP option is architecture specific. Other
|
||||
flags may become available on future platforms, eg for MMIO.
|
||||
Initialise the chanA.irqs to &z8530_nop to start the chip up
|
||||
as disabled and discarding interrupt events. This ensures that
|
||||
stray interrupts will be mopped up and not hang the bus. Set
|
||||
chanA.dev to point to the device structure itself. The
|
||||
private and name field you may use as you wish. The private field
|
||||
is unused by the Z85230 layer. The name is used for error reporting
|
||||
and it may thus make sense to make it match the network name.
|
||||
</para>
|
||||
<para>
|
||||
Repeat the same operation with the B channel if your chip has
|
||||
both channels wired to something useful. This isn't always the
|
||||
case. If it is not wired then the I/O values do not matter, but
|
||||
you must initialise chanB.dev.
|
||||
</para>
|
||||
<para>
|
||||
If your board has DMA facilities then initialise the txdma and
|
||||
rxdma fields for the relevant channels. You must also allocate the
|
||||
ISA DMA channels and do any necessary board level initialisation
|
||||
to configure them. The low level driver will do the Z8530 and
|
||||
DMA controller programming but not board specific magic.
|
||||
</para>
|
||||
<para>
|
||||
Having initialised the device you can then call
|
||||
<function>z8530_init</function>. This will probe the chip and
|
||||
reset it into a known state. An identification sequence is then
|
||||
run to identify the chip type. If the checks fail to pass the
|
||||
function returns a non zero error code. Typically this indicates
|
||||
that the port given is not valid. After this call the
|
||||
type field of the z8530_dev structure is initialised to either
|
||||
Z8530, Z85C30 or Z85230 according to the chip found.
|
||||
</para>
|
||||
<para>
|
||||
Once you have called z8530_init you can also make use of the utility
|
||||
function <function>z8530_describe</function>. This provides a
|
||||
consistent reporting format for the Z8530 devices, and allows all
|
||||
the drivers to provide consistent reporting.
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="Attaching_Network_Interfaces">
|
||||
<title>Attaching Network Interfaces</title>
|
||||
<para>
|
||||
If you wish to use the network interface facilities of the driver,
|
||||
then you need to attach a network device to each channel that is
|
||||
present and in use. In addition to use the generic HDLC
|
||||
you need to follow some additional plumbing rules. They may seem
|
||||
complex but a look at the example hostess_sv11 driver should
|
||||
reassure you.
|
||||
</para>
|
||||
<para>
|
||||
The network device used for each channel should be pointed to by
|
||||
the netdevice field of each channel. The hdlc-> priv field of the
|
||||
network device points to your private data - you will need to be
|
||||
able to find your private data from this.
|
||||
</para>
|
||||
<para>
|
||||
The way most drivers approach this particular problem is to
|
||||
create a structure holding the Z8530 device definition and
|
||||
put that into the private field of the network device. The
|
||||
network device fields of the channels then point back to the
|
||||
network devices.
|
||||
</para>
|
||||
<para>
|
||||
If you wish to use the generic HDLC then you need to register
|
||||
the HDLC device.
|
||||
</para>
|
||||
<para>
|
||||
Before you register your network device you will also need to
|
||||
provide suitable handlers for most of the network device callbacks.
|
||||
See the network device documentation for more details on this.
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="Configuring_And_Activating_The_Port">
|
||||
<title>Configuring And Activating The Port</title>
|
||||
<para>
|
||||
The Z85230 driver provides helper functions and tables to load the
|
||||
port registers on the Z8530 chips. When programming the register
|
||||
settings for a channel be aware that the documentation recommends
|
||||
initialisation orders. Strange things happen when these are not
|
||||
followed.
|
||||
</para>
|
||||
<para>
|
||||
<function>z8530_channel_load</function> takes an array of
|
||||
pairs of initialisation values in an array of u8 type. The first
|
||||
value is the Z8530 register number. Add 16 to indicate the alternate
|
||||
register bank on the later chips. The array is terminated by a 255.
|
||||
</para>
|
||||
<para>
|
||||
The driver provides a pair of public tables. The
|
||||
z8530_hdlc_kilostream table is for the UK 'Kilostream' service and
|
||||
also happens to cover most other end host configurations. The
|
||||
z8530_hdlc_kilostream_85230 table is the same configuration using
|
||||
the enhancements of the 85230 chip. The configuration loaded is
|
||||
standard NRZ encoded synchronous data with HDLC bitstuffing. All
|
||||
of the timing is taken from the other end of the link.
|
||||
</para>
|
||||
<para>
|
||||
When writing your own tables be aware that the driver internally
|
||||
tracks register values. It may need to reload values. You should
|
||||
therefore be sure to set registers 1-7, 9-11, 14 and 15 in all
|
||||
configurations. Where the register settings depend on DMA selection
|
||||
the driver will update the bits itself when you open or close.
|
||||
Loading a new table with the interface open is not recommended.
|
||||
</para>
|
||||
<para>
|
||||
There are three standard configurations supported by the core
|
||||
code. In PIO mode the interface is programmed up to use
|
||||
interrupt driven PIO. This places high demands on the host processor
|
||||
to avoid latency. The driver is written to take account of latency
|
||||
issues but it cannot avoid latencies caused by other drivers,
|
||||
notably IDE in PIO mode. Because the drivers allocate buffers you
|
||||
must also prevent MTU changes while the port is open.
|
||||
</para>
|
||||
<para>
|
||||
Once the port is open it will call the rx_function of each channel
|
||||
whenever a completed packet arrived. This is invoked from
|
||||
interrupt context and passes you the channel and a network
|
||||
buffer (struct sk_buff) holding the data. The data includes
|
||||
the CRC bytes so most users will want to trim the last two
|
||||
bytes before processing the data. This function is very timing
|
||||
critical. When you wish to simply discard data the support
|
||||
code provides the function <function>z8530_null_rx</function>
|
||||
to discard the data.
|
||||
</para>
|
||||
<para>
|
||||
To active PIO mode sending and receiving the <function>
|
||||
z8530_sync_open</function> is called. This expects to be passed
|
||||
the network device and the channel. Typically this is called from
|
||||
your network device open callback. On a failure a non zero error
|
||||
status is returned. The <function>z8530_sync_close</function>
|
||||
function shuts down a PIO channel. This must be done before the
|
||||
channel is opened again and before the driver shuts down
|
||||
and unloads.
|
||||
</para>
|
||||
<para>
|
||||
The ideal mode of operation is dual channel DMA mode. Here the
|
||||
kernel driver will configure the board for DMA in both directions.
|
||||
The driver also handles ISA DMA issues such as controller
|
||||
programming and the memory range limit for you. This mode is
|
||||
activated by calling the <function>z8530_sync_dma_open</function>
|
||||
function. On failure a non zero error value is returned.
|
||||
Once this mode is activated it can be shut down by calling the
|
||||
<function>z8530_sync_dma_close</function>. You must call the close
|
||||
function matching the open mode you used.
|
||||
</para>
|
||||
<para>
|
||||
The final supported mode uses a single DMA channel to drive the
|
||||
transmit side. As the Z85C30 has a larger FIFO on the receive
|
||||
channel this tends to increase the maximum speed a little.
|
||||
This is activated by calling the <function>z8530_sync_txdma_open
|
||||
</function>. This returns a non zero error code on failure. The
|
||||
<function>z8530_sync_txdma_close</function> function closes down
|
||||
the Z8530 interface from this mode.
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="Network_Layer_Functions">
|
||||
<title>Network Layer Functions</title>
|
||||
<para>
|
||||
The Z8530 layer provides functions to queue packets for
|
||||
transmission. The driver internally buffers the frame currently
|
||||
being transmitted and one further frame (in order to keep back
|
||||
to back transmission running). Any further buffering is up to
|
||||
the caller.
|
||||
</para>
|
||||
<para>
|
||||
The function <function>z8530_queue_xmit</function> takes a network
|
||||
buffer in sk_buff format and queues it for transmission. The
|
||||
caller must provide the entire packet with the exception of the
|
||||
bitstuffing and CRC. This is normally done by the caller via
|
||||
the generic HDLC interface layer. It returns 0 if the buffer has been
|
||||
queued and non zero values for queue full. If the function accepts
|
||||
the buffer it becomes property of the Z8530 layer and the caller
|
||||
should not free it.
|
||||
</para>
|
||||
<para>
|
||||
The function <function>z8530_get_stats</function> returns a pointer
|
||||
to an internally maintained per interface statistics block. This
|
||||
provides most of the interface code needed to implement the network
|
||||
layer get_stats callback.
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="Porting_The_Z8530_Driver">
|
||||
<title>Porting The Z8530 Driver</title>
|
||||
<para>
|
||||
The Z8530 driver is written to be portable. In DMA mode it makes
|
||||
assumptions about the use of ISA DMA. These are probably warranted
|
||||
in most cases as the Z85230 in particular was designed to glue to PC
|
||||
type machines. The PIO mode makes no real assumptions.
|
||||
</para>
|
||||
<para>
|
||||
Should you need to retarget the Z8530 driver to another architecture
|
||||
the only code that should need changing are the port I/O functions.
|
||||
At the moment these assume PC I/O port accesses. This may not be
|
||||
appropriate for all platforms. Replacing
|
||||
<function>z8530_read_port</function> and <function>z8530_write_port
|
||||
</function> is intended to be all that is required to port this
|
||||
driver layer.
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="bugs">
|
||||
<title>Known Bugs And Assumptions</title>
|
||||
<para>
|
||||
<variablelist>
|
||||
<varlistentry><term>Interrupt Locking</term>
|
||||
<listitem>
|
||||
<para>
|
||||
The locking in the driver is done via the global cli/sti lock. This
|
||||
makes for relatively poor SMP performance. Switching this to use a
|
||||
per device spin lock would probably materially improve performance.
|
||||
</para>
|
||||
</listitem></varlistentry>
|
||||
|
||||
<varlistentry><term>Occasional Failures</term>
|
||||
<listitem>
|
||||
<para>
|
||||
We have reports of occasional failures when run for very long
|
||||
periods of time and the driver starts to receive junk frames. At
|
||||
the moment the cause of this is not clear.
|
||||
</para>
|
||||
</listitem></varlistentry>
|
||||
</variablelist>
|
||||
|
||||
</para>
|
||||
</chapter>
|
||||
|
||||
<chapter id="pubfunctions">
|
||||
<title>Public Functions Provided</title>
|
||||
!Edrivers/net/wan/z85230.c
|
||||
</chapter>
|
||||
|
||||
<chapter id="intfunctions">
|
||||
<title>Internal Functions</title>
|
||||
!Idrivers/net/wan/z85230.c
|
||||
</chapter>
|
||||
|
||||
</book>
|
@ -1,9 +1,8 @@
|
||||
=====================
|
||||
The Linux IPMI Driver
|
||||
=====================
|
||||
|
||||
The Linux IPMI Driver
|
||||
---------------------
|
||||
Corey Minyard
|
||||
<minyard@mvista.com>
|
||||
<minyard@acm.org>
|
||||
:Author: Corey Minyard <minyard@mvista.com> / <minyard@acm.org>
|
||||
|
||||
The Intelligent Platform Management Interface, or IPMI, is a
|
||||
standard for controlling intelligent devices that monitor a system.
|
||||
@ -141,7 +140,7 @@ Addressing
|
||||
----------
|
||||
|
||||
The IPMI addressing works much like IP addresses, you have an overlay
|
||||
to handle the different address types. The overlay is:
|
||||
to handle the different address types. The overlay is::
|
||||
|
||||
struct ipmi_addr
|
||||
{
|
||||
@ -153,7 +152,7 @@ to handle the different address types. The overlay is:
|
||||
The addr_type determines what the address really is. The driver
|
||||
currently understands two different types of addresses.
|
||||
|
||||
"System Interface" addresses are defined as:
|
||||
"System Interface" addresses are defined as::
|
||||
|
||||
struct ipmi_system_interface_addr
|
||||
{
|
||||
@ -166,7 +165,7 @@ straight to the BMC on the current card. The channel must be
|
||||
IPMI_BMC_CHANNEL.
|
||||
|
||||
Messages that are destined to go out on the IPMB bus use the
|
||||
IPMI_IPMB_ADDR_TYPE address type. The format is
|
||||
IPMI_IPMB_ADDR_TYPE address type. The format is::
|
||||
|
||||
struct ipmi_ipmb_addr
|
||||
{
|
||||
@ -184,16 +183,16 @@ spec.
|
||||
Messages
|
||||
--------
|
||||
|
||||
Messages are defined as:
|
||||
Messages are defined as::
|
||||
|
||||
struct ipmi_msg
|
||||
{
|
||||
struct ipmi_msg
|
||||
{
|
||||
unsigned char netfn;
|
||||
unsigned char lun;
|
||||
unsigned char cmd;
|
||||
unsigned char *data;
|
||||
int data_len;
|
||||
};
|
||||
};
|
||||
|
||||
The driver takes care of adding/stripping the header information. The
|
||||
data portion is just the data to be send (do NOT put addressing info
|
||||
@ -208,7 +207,7 @@ block of data, even when receiving messages. Otherwise the driver
|
||||
will have no place to put the message.
|
||||
|
||||
Messages coming up from the message handler in kernelland will come in
|
||||
as:
|
||||
as::
|
||||
|
||||
struct ipmi_recv_msg
|
||||
{
|
||||
@ -246,6 +245,7 @@ and the user should not have to care what type of SMI is below them.
|
||||
|
||||
|
||||
Watching For Interfaces
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
When your code comes up, the IPMI driver may or may not have detected
|
||||
if IPMI devices exist. So you might have to defer your setup until
|
||||
@ -256,6 +256,7 @@ and tell you when they come and go.
|
||||
|
||||
|
||||
Creating the User
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
To use the message handler, you must first create a user using
|
||||
ipmi_create_user. The interface number specifies which SMI you want
|
||||
@ -272,6 +273,7 @@ closing the device automatically destroys the user.
|
||||
|
||||
|
||||
Messaging
|
||||
^^^^^^^^^
|
||||
|
||||
To send a message from kernel-land, the ipmi_request_settime() call does
|
||||
pretty much all message handling. Most of the parameter are
|
||||
@ -321,6 +323,7 @@ though, since it is tricky to manage your own buffers.
|
||||
|
||||
|
||||
Events and Incoming Commands
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The driver takes care of polling for IPMI events and receiving
|
||||
commands (commands are messages that are not responses, they are
|
||||
@ -367,7 +370,7 @@ in the system. It discovers interfaces through a host of different
|
||||
methods, depending on the system.
|
||||
|
||||
You can specify up to four interfaces on the module load line and
|
||||
control some module parameters:
|
||||
control some module parameters::
|
||||
|
||||
modprobe ipmi_si.o type=<type1>,<type2>....
|
||||
ports=<port1>,<port2>... addrs=<addr1>,<addr2>...
|
||||
@ -437,7 +440,7 @@ default is one. Setting to 0 is useful with the hotmod, but is
|
||||
obviously only useful for modules.
|
||||
|
||||
When compiled into the kernel, the parameters can be specified on the
|
||||
kernel command line as:
|
||||
kernel command line as::
|
||||
|
||||
ipmi_si.type=<type1>,<type2>...
|
||||
ipmi_si.ports=<port1>,<port2>... ipmi_si.addrs=<addr1>,<addr2>...
|
||||
@ -474,16 +477,22 @@ The driver supports a hot add and remove of interfaces. This way,
|
||||
interfaces can be added or removed after the kernel is up and running.
|
||||
This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a
|
||||
write-only parameter. You write a string to this interface. The string
|
||||
has the format:
|
||||
has the format::
|
||||
|
||||
<op1>[:op2[:op3...]]
|
||||
The "op"s are:
|
||||
|
||||
The "op"s are::
|
||||
|
||||
add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]]
|
||||
You can specify more than one interface on the line. The "opt"s are:
|
||||
|
||||
You can specify more than one interface on the line. The "opt"s are::
|
||||
|
||||
rsp=<regspacing>
|
||||
rsi=<regsize>
|
||||
rsh=<regshift>
|
||||
irq=<irq>
|
||||
ipmb=<ipmb slave addr>
|
||||
|
||||
and these have the same meanings as discussed above. Note that you
|
||||
can also use this on the kernel command line for a more compact format
|
||||
for specifying an interface. Note that when removing an interface,
|
||||
@ -496,7 +505,7 @@ The SMBus Driver (SSIF)
|
||||
The SMBus driver allows up to 4 SMBus devices to be configured in the
|
||||
system. By default, the driver will only register with something it
|
||||
finds in DMI or ACPI tables. You can change this
|
||||
at module load time (for a module) with:
|
||||
at module load time (for a module) with::
|
||||
|
||||
modprobe ipmi_ssif.o
|
||||
addr=<i2caddr1>[,<i2caddr2>[,...]]
|
||||
@ -535,7 +544,7 @@ the smb_addr parameter unless you have DMI or ACPI data to tell the
|
||||
driver what to use.
|
||||
|
||||
When compiled into the kernel, the addresses can be specified on the
|
||||
kernel command line as:
|
||||
kernel command line as::
|
||||
|
||||
ipmb_ssif.addr=<i2caddr1>[,<i2caddr2>[...]]
|
||||
ipmi_ssif.adapter=<adapter1>[,<adapter2>[...]]
|
||||
@ -565,9 +574,9 @@ Some users need more detailed information about a device, like where
|
||||
the address came from or the raw base device for the IPMI interface.
|
||||
You can use the IPMI smi_watcher to catch the IPMI interfaces as they
|
||||
come or go, and to grab the information, you can use the function
|
||||
ipmi_get_smi_info(), which returns the following structure:
|
||||
ipmi_get_smi_info(), which returns the following structure::
|
||||
|
||||
struct ipmi_smi_info {
|
||||
struct ipmi_smi_info {
|
||||
enum ipmi_addr_src addr_src;
|
||||
struct device *dev;
|
||||
union {
|
||||
@ -575,7 +584,7 @@ struct ipmi_smi_info {
|
||||
void *acpi_handle;
|
||||
} acpi_info;
|
||||
} addr_info;
|
||||
};
|
||||
};
|
||||
|
||||
Currently special info for only for SI_ACPI address sources is
|
||||
returned. Others may be added as necessary.
|
||||
@ -590,7 +599,7 @@ Watchdog
|
||||
|
||||
A watchdog timer is provided that implements the Linux-standard
|
||||
watchdog timer interface. It has three module parameters that can be
|
||||
used to control it:
|
||||
used to control it::
|
||||
|
||||
modprobe ipmi_watchdog timeout=<t> pretimeout=<t> action=<action type>
|
||||
preaction=<preaction type> preop=<preop type> start_now=x
|
||||
@ -635,7 +644,7 @@ watchdog device is closed. The default value of nowayout is true
|
||||
if the CONFIG_WATCHDOG_NOWAYOUT option is enabled, or false if not.
|
||||
|
||||
When compiled into the kernel, the kernel command line is available
|
||||
for configuring the watchdog:
|
||||
for configuring the watchdog::
|
||||
|
||||
ipmi_watchdog.timeout=<t> ipmi_watchdog.pretimeout=<t>
|
||||
ipmi_watchdog.action=<action type>
|
||||
@ -675,6 +684,7 @@ also get a bunch of OEM events holding the panic string.
|
||||
|
||||
|
||||
The field settings of the events are:
|
||||
|
||||
* Generator ID: 0x21 (kernel)
|
||||
* EvM Rev: 0x03 (this event is formatting in IPMI 1.0 format)
|
||||
* Sensor Type: 0x20 (OS critical stop sensor)
|
||||
@ -683,18 +693,20 @@ The field settings of the events are:
|
||||
* Event Data 1: 0xa1 (Runtime stop in OEM bytes 2 and 3)
|
||||
* Event data 2: second byte of panic string
|
||||
* Event data 3: third byte of panic string
|
||||
|
||||
See the IPMI spec for the details of the event layout. This event is
|
||||
always sent to the local management controller. It will handle routing
|
||||
the message to the right place
|
||||
|
||||
Other OEM events have the following format:
|
||||
Record ID (bytes 0-1): Set by the SEL.
|
||||
Record type (byte 2): 0xf0 (OEM non-timestamped)
|
||||
byte 3: The slave address of the card saving the panic
|
||||
byte 4: A sequence number (starting at zero)
|
||||
The rest of the bytes (11 bytes) are the panic string. If the panic string
|
||||
is longer than 11 bytes, multiple messages will be sent with increasing
|
||||
sequence numbers.
|
||||
|
||||
* Record ID (bytes 0-1): Set by the SEL.
|
||||
* Record type (byte 2): 0xf0 (OEM non-timestamped)
|
||||
* byte 3: The slave address of the card saving the panic
|
||||
* byte 4: A sequence number (starting at zero)
|
||||
The rest of the bytes (11 bytes) are the panic string. If the panic string
|
||||
is longer than 11 bytes, multiple messages will be sent with increasing
|
||||
sequence numbers.
|
||||
|
||||
Because you cannot send OEM events using the standard interface, this
|
||||
function will attempt to find an SEL and add the events there. It
|
||||
|
@ -1,8 +1,11 @@
|
||||
ChangeLog:
|
||||
Started by Ingo Molnar <mingo@redhat.com>
|
||||
Update by Max Krasnyansky <maxk@qualcomm.com>
|
||||
|
||||
================
|
||||
SMP IRQ affinity
|
||||
================
|
||||
|
||||
ChangeLog:
|
||||
- Started by Ingo Molnar <mingo@redhat.com>
|
||||
- Update by Max Krasnyansky <maxk@qualcomm.com>
|
||||
|
||||
|
||||
/proc/irq/IRQ#/smp_affinity and /proc/irq/IRQ#/smp_affinity_list specify
|
||||
which target CPUs are permitted for a given IRQ source. It's a bitmask
|
||||
@ -16,50 +19,52 @@ will be set to the default mask. It can then be changed as described above.
|
||||
Default mask is 0xffffffff.
|
||||
|
||||
Here is an example of restricting IRQ44 (eth1) to CPU0-3 then restricting
|
||||
it to CPU4-7 (this is an 8-CPU SMP box):
|
||||
it to CPU4-7 (this is an 8-CPU SMP box)::
|
||||
|
||||
[root@moon 44]# cd /proc/irq/44
|
||||
[root@moon 44]# cat smp_affinity
|
||||
ffffffff
|
||||
[root@moon 44]# cd /proc/irq/44
|
||||
[root@moon 44]# cat smp_affinity
|
||||
ffffffff
|
||||
|
||||
[root@moon 44]# echo 0f > smp_affinity
|
||||
[root@moon 44]# cat smp_affinity
|
||||
0000000f
|
||||
[root@moon 44]# ping -f h
|
||||
PING hell (195.4.7.3): 56 data bytes
|
||||
...
|
||||
--- hell ping statistics ---
|
||||
6029 packets transmitted, 6027 packets received, 0% packet loss
|
||||
round-trip min/avg/max = 0.1/0.1/0.4 ms
|
||||
[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
|
||||
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
|
||||
44: 1068 1785 1785 1783 0 0 0 0 IO-APIC-level eth1
|
||||
[root@moon 44]# echo 0f > smp_affinity
|
||||
[root@moon 44]# cat smp_affinity
|
||||
0000000f
|
||||
[root@moon 44]# ping -f h
|
||||
PING hell (195.4.7.3): 56 data bytes
|
||||
...
|
||||
--- hell ping statistics ---
|
||||
6029 packets transmitted, 6027 packets received, 0% packet loss
|
||||
round-trip min/avg/max = 0.1/0.1/0.4 ms
|
||||
[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
|
||||
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
|
||||
44: 1068 1785 1785 1783 0 0 0 0 IO-APIC-level eth1
|
||||
|
||||
As can be seen from the line above IRQ44 was delivered only to the first four
|
||||
processors (0-3).
|
||||
Now lets restrict that IRQ to CPU(4-7).
|
||||
|
||||
[root@moon 44]# echo f0 > smp_affinity
|
||||
[root@moon 44]# cat smp_affinity
|
||||
000000f0
|
||||
[root@moon 44]# ping -f h
|
||||
PING hell (195.4.7.3): 56 data bytes
|
||||
..
|
||||
--- hell ping statistics ---
|
||||
2779 packets transmitted, 2777 packets received, 0% packet loss
|
||||
round-trip min/avg/max = 0.1/0.5/585.4 ms
|
||||
[root@moon 44]# cat /proc/interrupts | 'CPU\|44:'
|
||||
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
|
||||
44: 1068 1785 1785 1783 1784 1069 1070 1069 IO-APIC-level eth1
|
||||
::
|
||||
|
||||
[root@moon 44]# echo f0 > smp_affinity
|
||||
[root@moon 44]# cat smp_affinity
|
||||
000000f0
|
||||
[root@moon 44]# ping -f h
|
||||
PING hell (195.4.7.3): 56 data bytes
|
||||
..
|
||||
--- hell ping statistics ---
|
||||
2779 packets transmitted, 2777 packets received, 0% packet loss
|
||||
round-trip min/avg/max = 0.1/0.5/585.4 ms
|
||||
[root@moon 44]# cat /proc/interrupts | 'CPU\|44:'
|
||||
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
|
||||
44: 1068 1785 1785 1783 1784 1069 1070 1069 IO-APIC-level eth1
|
||||
|
||||
This time around IRQ44 was delivered only to the last four processors.
|
||||
i.e counters for the CPU0-3 did not change.
|
||||
|
||||
Here is an example of limiting that same irq (44) to cpus 1024 to 1031:
|
||||
Here is an example of limiting that same irq (44) to cpus 1024 to 1031::
|
||||
|
||||
[root@moon 44]# echo 1024-1031 > smp_affinity_list
|
||||
[root@moon 44]# cat smp_affinity_list
|
||||
1024-1031
|
||||
[root@moon 44]# echo 1024-1031 > smp_affinity_list
|
||||
[root@moon 44]# cat smp_affinity_list
|
||||
1024-1031
|
||||
|
||||
Note that to do this with a bitmask would require 32 bitmasks of zero
|
||||
to follow the pertinent one.
|
||||
|
@ -1,4 +1,6 @@
|
||||
irq_domain interrupt number mapping library
|
||||
===============================================
|
||||
The irq_domain interrupt number mapping library
|
||||
===============================================
|
||||
|
||||
The current design of the Linux kernel uses a single large number
|
||||
space where each separate IRQ source is assigned a different number.
|
||||
@ -36,7 +38,9 @@ irq_domain also implements translation from an abstract irq_fwspec
|
||||
structure to hwirq numbers (Device Tree and ACPI GSI so far), and can
|
||||
be easily extended to support other IRQ topology data sources.
|
||||
|
||||
=== irq_domain usage ===
|
||||
irq_domain usage
|
||||
================
|
||||
|
||||
An interrupt controller driver creates and registers an irq_domain by
|
||||
calling one of the irq_domain_add_*() functions (each mapping method
|
||||
has a different allocator function, more on that later). The function
|
||||
@ -62,15 +66,21 @@ If the driver has the Linux IRQ number or the irq_data pointer, and
|
||||
needs to know the associated hwirq number (such as in the irq_chip
|
||||
callbacks) then it can be directly obtained from irq_data->hwirq.
|
||||
|
||||
=== Types of irq_domain mappings ===
|
||||
Types of irq_domain mappings
|
||||
============================
|
||||
|
||||
There are several mechanisms available for reverse mapping from hwirq
|
||||
to Linux irq, and each mechanism uses a different allocation function.
|
||||
Which reverse map type should be used depends on the use case. Each
|
||||
of the reverse map types are described below:
|
||||
|
||||
==== Linear ====
|
||||
irq_domain_add_linear()
|
||||
irq_domain_create_linear()
|
||||
Linear
|
||||
------
|
||||
|
||||
::
|
||||
|
||||
irq_domain_add_linear()
|
||||
irq_domain_create_linear()
|
||||
|
||||
The linear reverse map maintains a fixed size table indexed by the
|
||||
hwirq number. When a hwirq is mapped, an irq_desc is allocated for
|
||||
@ -89,9 +99,13 @@ accepts a more general abstraction 'struct fwnode_handle'.
|
||||
|
||||
The majority of drivers should use the linear map.
|
||||
|
||||
==== Tree ====
|
||||
irq_domain_add_tree()
|
||||
irq_domain_create_tree()
|
||||
Tree
|
||||
----
|
||||
|
||||
::
|
||||
|
||||
irq_domain_add_tree()
|
||||
irq_domain_create_tree()
|
||||
|
||||
The irq_domain maintains a radix tree map from hwirq numbers to Linux
|
||||
IRQs. When an hwirq is mapped, an irq_desc is allocated and the
|
||||
@ -109,8 +123,12 @@ accepts a more general abstraction 'struct fwnode_handle'.
|
||||
|
||||
Very few drivers should need this mapping.
|
||||
|
||||
==== No Map ===-
|
||||
irq_domain_add_nomap()
|
||||
No Map
|
||||
------
|
||||
|
||||
::
|
||||
|
||||
irq_domain_add_nomap()
|
||||
|
||||
The No Map mapping is to be used when the hwirq number is
|
||||
programmable in the hardware. In this case it is best to program the
|
||||
@ -121,10 +139,14 @@ Linux IRQ number into the hardware.
|
||||
|
||||
Most drivers cannot use this mapping.
|
||||
|
||||
==== Legacy ====
|
||||
irq_domain_add_simple()
|
||||
irq_domain_add_legacy()
|
||||
irq_domain_add_legacy_isa()
|
||||
Legacy
|
||||
------
|
||||
|
||||
::
|
||||
|
||||
irq_domain_add_simple()
|
||||
irq_domain_add_legacy()
|
||||
irq_domain_add_legacy_isa()
|
||||
|
||||
The Legacy mapping is a special case for drivers that already have a
|
||||
range of irq_descs allocated for the hwirqs. It is used when the
|
||||
@ -163,14 +185,17 @@ that the driver using the simple domain call irq_create_mapping()
|
||||
before any irq_find_mapping() since the latter will actually work
|
||||
for the static IRQ assignment case.
|
||||
|
||||
==== Hierarchy IRQ domain ====
|
||||
Hierarchy IRQ domain
|
||||
--------------------
|
||||
|
||||
On some architectures, there may be multiple interrupt controllers
|
||||
involved in delivering an interrupt from the device to the target CPU.
|
||||
Let's look at a typical interrupt delivering path on x86 platforms:
|
||||
Let's look at a typical interrupt delivering path on x86 platforms::
|
||||
|
||||
Device --> IOAPIC -> Interrupt remapping Controller -> Local APIC -> CPU
|
||||
Device --> IOAPIC -> Interrupt remapping Controller -> Local APIC -> CPU
|
||||
|
||||
There are three interrupt controllers involved:
|
||||
|
||||
1) IOAPIC controller
|
||||
2) Interrupt remapping controller
|
||||
3) Local APIC controller
|
||||
@ -180,7 +205,8 @@ hardware architecture, an irq_domain data structure is built for each
|
||||
interrupt controller and those irq_domains are organized into hierarchy.
|
||||
When building irq_domain hierarchy, the irq_domain near to the device is
|
||||
child and the irq_domain near to CPU is parent. So a hierarchy structure
|
||||
as below will be built for the example above.
|
||||
as below will be built for the example above::
|
||||
|
||||
CPU Vector irq_domain (root irq_domain to manage CPU vectors)
|
||||
^
|
||||
|
|
||||
@ -190,6 +216,7 @@ as below will be built for the example above.
|
||||
IOAPIC irq_domain (manage IOAPIC delivery entries/pins)
|
||||
|
||||
There are four major interfaces to use hierarchy irq_domain:
|
||||
|
||||
1) irq_domain_alloc_irqs(): allocate IRQ descriptors and interrupt
|
||||
controller related resources to deliver these interrupts.
|
||||
2) irq_domain_free_irqs(): free IRQ descriptors and interrupt controller
|
||||
@ -199,7 +226,8 @@ There are four major interfaces to use hierarchy irq_domain:
|
||||
4) irq_domain_deactivate_irq(): deactivate interrupt controller hardware
|
||||
to stop delivering the interrupt.
|
||||
|
||||
Following changes are needed to support hierarchy irq_domain.
|
||||
Following changes are needed to support hierarchy irq_domain:
|
||||
|
||||
1) a new field 'parent' is added to struct irq_domain; it's used to
|
||||
maintain irq_domain hierarchy information.
|
||||
2) a new field 'parent_data' is added to struct irq_data; it's used to
|
||||
@ -223,6 +251,7 @@ software architecture.
|
||||
|
||||
For an interrupt controller driver to support hierarchy irq_domain, it
|
||||
needs to:
|
||||
|
||||
1) Implement irq_domain_ops.alloc and irq_domain_ops.free
|
||||
2) Optionally implement irq_domain_ops.activate and
|
||||
irq_domain_ops.deactivate.
|
||||
@ -231,5 +260,42 @@ needs to:
|
||||
4) No need to implement irq_domain_ops.map and irq_domain_ops.unmap,
|
||||
they are unused with hierarchy irq_domain.
|
||||
|
||||
Hierarchy irq_domain may also be used to support other architectures,
|
||||
such as ARM, ARM64 etc.
|
||||
Hierarchy irq_domain is in no way x86 specific, and is heavily used to
|
||||
support other architectures, such as ARM, ARM64 etc.
|
||||
|
||||
=== Debugging ===
|
||||
|
||||
If you switch on CONFIG_IRQ_DOMAIN_DEBUG (which depends on
|
||||
CONFIG_IRQ_DOMAIN and CONFIG_DEBUG_FS), you will find a new file in
|
||||
your debugfs mount point, called irq_domain_mapping. This file
|
||||
contains a live snapshot of all the IRQ domains in the system:
|
||||
|
||||
name mapped linear-max direct-max devtree-node
|
||||
pl061 8 8 0 /smb/gpio@e0080000
|
||||
pl061 8 8 0 /smb/gpio@e1050000
|
||||
pMSI 0 0 0 /interrupt-controller@e1101000/v2m@e0080000
|
||||
MSI 37 0 0 /interrupt-controller@e1101000/v2m@e0080000
|
||||
GICv2m 37 0 0 /interrupt-controller@e1101000/v2m@e0080000
|
||||
GICv2 448 448 0 /interrupt-controller@e1101000
|
||||
|
||||
it also iterates over the interrupts to display their mapping in the
|
||||
domains, and makes the domain stacking visible:
|
||||
|
||||
|
||||
irq hwirq chip name chip data active type domain
|
||||
1 0x00019 GICv2 0xffff00000916bfd8 * LINEAR GICv2
|
||||
2 0x0001d GICv2 0xffff00000916bfd8 LINEAR GICv2
|
||||
3 0x0001e GICv2 0xffff00000916bfd8 * LINEAR GICv2
|
||||
4 0x0001b GICv2 0xffff00000916bfd8 * LINEAR GICv2
|
||||
5 0x0001a GICv2 0xffff00000916bfd8 LINEAR GICv2
|
||||
[...]
|
||||
96 0x81808 MSI 0x (null) RADIX MSI
|
||||
96+ 0x00063 GICv2m 0xffff8003ee116980 RADIX GICv2m
|
||||
96+ 0x00063 GICv2 0xffff00000916bfd8 LINEAR GICv2
|
||||
97 0x08800 MSI 0x (null) * RADIX MSI
|
||||
97+ 0x00064 GICv2m 0xffff8003ee116980 * RADIX GICv2m
|
||||
97+ 0x00064 GICv2 0xffff00000916bfd8 * LINEAR GICv2
|
||||
|
||||
Here, interrupts 1-5 are only using a single domain, while 96 and 97
|
||||
are build out of a stack of three domain, each level performing a
|
||||
particular function.
|
||||
|
@ -1,4 +1,6 @@
|
||||
===============
|
||||
What is an IRQ?
|
||||
===============
|
||||
|
||||
An IRQ is an interrupt request from a device.
|
||||
Currently they can come in over a pin, or over a packet.
|
||||
|
@ -1,3 +1,4 @@
|
||||
===================
|
||||
Linux IOMMU Support
|
||||
===================
|
||||
|
||||
@ -9,11 +10,11 @@ This guide gives a quick cheat sheet for some basic understanding.
|
||||
|
||||
Some Keywords
|
||||
|
||||
DMAR - DMA remapping
|
||||
DRHD - DMA Remapping Hardware Unit Definition
|
||||
RMRR - Reserved memory Region Reporting Structure
|
||||
ZLR - Zero length reads from PCI devices
|
||||
IOVA - IO Virtual address.
|
||||
- DMAR - DMA remapping
|
||||
- DRHD - DMA Remapping Hardware Unit Definition
|
||||
- RMRR - Reserved memory Region Reporting Structure
|
||||
- ZLR - Zero length reads from PCI devices
|
||||
- IOVA - IO Virtual address.
|
||||
|
||||
Basic stuff
|
||||
-----------
|
||||
@ -33,7 +34,7 @@ devices that need to access these regions. OS is expected to setup
|
||||
unity mappings for these regions for these devices to access these regions.
|
||||
|
||||
How is IOVA generated?
|
||||
---------------------
|
||||
----------------------
|
||||
|
||||
Well behaved drivers call pci_map_*() calls before sending command to device
|
||||
that needs to perform DMA. Once DMA is completed and mapping is no longer
|
||||
@ -82,14 +83,14 @@ in ACPI.
|
||||
ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
|
||||
|
||||
When DMAR is being processed and initialized by ACPI, prints DMAR locations
|
||||
and any RMRR's processed.
|
||||
and any RMRR's processed::
|
||||
|
||||
ACPI DMAR:Host address width 36
|
||||
ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
|
||||
ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
|
||||
ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
|
||||
ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
|
||||
ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
|
||||
ACPI DMAR:Host address width 36
|
||||
ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
|
||||
ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
|
||||
ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
|
||||
ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
|
||||
ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
|
||||
|
||||
When DMAR is enabled for use, you will notice..
|
||||
|
||||
@ -98,10 +99,12 @@ PCI-DMA: Using DMAR IOMMU
|
||||
Fault reporting
|
||||
---------------
|
||||
|
||||
DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
|
||||
DMAR:[fault reason 05] PTE Write access is not set
|
||||
DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
|
||||
DMAR:[fault reason 05] PTE Write access is not set
|
||||
::
|
||||
|
||||
DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
|
||||
DMAR:[fault reason 05] PTE Write access is not set
|
||||
DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
|
||||
DMAR:[fault reason 05] PTE Write access is not set
|
||||
|
||||
TBD
|
||||
----
|
||||
|
@ -1 +1,126 @@
|
||||
# -*- makefile -*-
|
||||
# Makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
subdir-y :=
|
||||
|
||||
# You can set these variables from the command line.
|
||||
SPHINXBUILD = sphinx-build
|
||||
SPHINXOPTS =
|
||||
SPHINXDIRS = .
|
||||
_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py))
|
||||
SPHINX_CONF = conf.py
|
||||
PAPER =
|
||||
BUILDDIR = $(obj)/output
|
||||
PDFLATEX = xelatex
|
||||
LATEXOPTS = -interaction=batchmode
|
||||
|
||||
# User-friendly check for sphinx-build
|
||||
HAVE_SPHINX := $(shell if which $(SPHINXBUILD) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||
|
||||
ifeq ($(HAVE_SPHINX),0)
|
||||
|
||||
.DEFAULT:
|
||||
$(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.)
|
||||
@echo " SKIP Sphinx $@ target."
|
||||
|
||||
else # HAVE_SPHINX
|
||||
|
||||
# User-friendly check for pdflatex
|
||||
HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||
|
||||
# Internal variables.
|
||||
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||
PAPEROPT_letter = -D latex_paper_size=letter
|
||||
KERNELDOC = $(srctree)/scripts/kernel-doc
|
||||
KERNELDOC_CONF = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC)
|
||||
ALLSPHINXOPTS = $(KERNELDOC_CONF) $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)
|
||||
# the i18n builder cannot share the environment and doctrees with the others
|
||||
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
||||
|
||||
# commands; the 'cmd' from scripts/Kbuild.include is not *loopable*
|
||||
loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit;
|
||||
|
||||
# $2 sphinx builder e.g. "html"
|
||||
# $3 name of the build subfolder / e.g. "media", used as:
|
||||
# * dest folder relative to $(BUILDDIR) and
|
||||
# * cache folder relative to $(BUILDDIR)/.doctrees
|
||||
# $4 dest subfolder e.g. "man" for man pages at media/man
|
||||
# $5 reST source folder relative to $(srctree)/$(src),
|
||||
# e.g. "media" for the linux-tv book-set at ./Documentation/media
|
||||
|
||||
quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4)
|
||||
cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \
|
||||
PYTHONDONTWRITEBYTECODE=1 \
|
||||
BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \
|
||||
$(SPHINXBUILD) \
|
||||
-b $2 \
|
||||
-c $(abspath $(srctree)/$(src)) \
|
||||
-d $(abspath $(BUILDDIR)/.doctrees/$3) \
|
||||
-D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \
|
||||
$(ALLSPHINXOPTS) \
|
||||
$(abspath $(srctree)/$(src)/$5) \
|
||||
$(abspath $(BUILDDIR)/$3/$4)
|
||||
|
||||
htmldocs:
|
||||
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var)))
|
||||
|
||||
linkcheckdocs:
|
||||
@$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var)))
|
||||
|
||||
latexdocs:
|
||||
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var)))
|
||||
|
||||
ifeq ($(HAVE_PDFLATEX),0)
|
||||
|
||||
pdfdocs:
|
||||
$(warning The '$(PDFLATEX)' command was not found. Make sure you have it installed and in PATH to produce PDF output.)
|
||||
@echo " SKIP Sphinx $@ target."
|
||||
|
||||
else # HAVE_PDFLATEX
|
||||
|
||||
pdfdocs: latexdocs
|
||||
$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
|
||||
|
||||
endif # HAVE_PDFLATEX
|
||||
|
||||
epubdocs:
|
||||
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var)))
|
||||
|
||||
xmldocs:
|
||||
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var)))
|
||||
|
||||
endif # HAVE_SPHINX
|
||||
|
||||
# The following targets are independent of HAVE_SPHINX, and the rules should
|
||||
# work or silently pass without Sphinx.
|
||||
|
||||
# no-ops for the Sphinx toolchain
|
||||
sgmldocs:
|
||||
@:
|
||||
psdocs:
|
||||
@:
|
||||
mandocs:
|
||||
@:
|
||||
installmandocs:
|
||||
@:
|
||||
|
||||
cleandocs:
|
||||
$(Q)rm -rf $(BUILDDIR)
|
||||
$(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean
|
||||
|
||||
dochelp:
|
||||
@echo ' Linux kernel internal documentation in different formats from ReST:'
|
||||
@echo ' htmldocs - HTML'
|
||||
@echo ' latexdocs - LaTeX'
|
||||
@echo ' pdfdocs - PDF'
|
||||
@echo ' epubdocs - EPUB'
|
||||
@echo ' xmldocs - XML'
|
||||
@echo ' linkcheckdocs - check for broken external links (will connect to external hosts)'
|
||||
@echo ' cleandocs - clean all generated files'
|
||||
@echo
|
||||
@echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2'
|
||||
@echo ' valid values for SPHINXDIRS are: $(_SPHINXDIRS)'
|
||||
@echo
|
||||
@echo ' make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build'
|
||||
@echo ' configuration. This is e.g. useful to build with nit-picking config.'
|
||||
|
@ -1,130 +0,0 @@
|
||||
# -*- makefile -*-
|
||||
# Makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line.
|
||||
SPHINXBUILD = sphinx-build
|
||||
SPHINXOPTS =
|
||||
SPHINXDIRS = .
|
||||
_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py))
|
||||
SPHINX_CONF = conf.py
|
||||
PAPER =
|
||||
BUILDDIR = $(obj)/output
|
||||
PDFLATEX = xelatex
|
||||
LATEXOPTS = -interaction=batchmode
|
||||
|
||||
# User-friendly check for sphinx-build
|
||||
HAVE_SPHINX := $(shell if which $(SPHINXBUILD) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||
|
||||
ifeq ($(HAVE_SPHINX),0)
|
||||
|
||||
.DEFAULT:
|
||||
$(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.)
|
||||
@echo " SKIP Sphinx $@ target."
|
||||
|
||||
else ifneq ($(DOCBOOKS),)
|
||||
|
||||
# Skip Sphinx build if the user explicitly requested DOCBOOKS.
|
||||
.DEFAULT:
|
||||
@echo " SKIP Sphinx $@ target (DOCBOOKS specified)."
|
||||
|
||||
else # HAVE_SPHINX
|
||||
|
||||
# User-friendly check for pdflatex
|
||||
HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||
|
||||
# Internal variables.
|
||||
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||
PAPEROPT_letter = -D latex_paper_size=letter
|
||||
KERNELDOC = $(srctree)/scripts/kernel-doc
|
||||
KERNELDOC_CONF = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC)
|
||||
ALLSPHINXOPTS = $(KERNELDOC_CONF) $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)
|
||||
# the i18n builder cannot share the environment and doctrees with the others
|
||||
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
||||
|
||||
# commands; the 'cmd' from scripts/Kbuild.include is not *loopable*
|
||||
loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit;
|
||||
|
||||
# $2 sphinx builder e.g. "html"
|
||||
# $3 name of the build subfolder / e.g. "media", used as:
|
||||
# * dest folder relative to $(BUILDDIR) and
|
||||
# * cache folder relative to $(BUILDDIR)/.doctrees
|
||||
# $4 dest subfolder e.g. "man" for man pages at media/man
|
||||
# $5 reST source folder relative to $(srctree)/$(src),
|
||||
# e.g. "media" for the linux-tv book-set at ./Documentation/media
|
||||
|
||||
quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4)
|
||||
cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \
|
||||
PYTHONDONTWRITEBYTECODE=1 \
|
||||
BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \
|
||||
$(SPHINXBUILD) \
|
||||
-b $2 \
|
||||
-c $(abspath $(srctree)/$(src)) \
|
||||
-d $(abspath $(BUILDDIR)/.doctrees/$3) \
|
||||
-D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \
|
||||
$(ALLSPHINXOPTS) \
|
||||
$(abspath $(srctree)/$(src)/$5) \
|
||||
$(abspath $(BUILDDIR)/$3/$4)
|
||||
|
||||
htmldocs:
|
||||
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var)))
|
||||
|
||||
linkcheckdocs:
|
||||
@$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var)))
|
||||
|
||||
latexdocs:
|
||||
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var)))
|
||||
|
||||
ifeq ($(HAVE_PDFLATEX),0)
|
||||
|
||||
pdfdocs:
|
||||
$(warning The '$(PDFLATEX)' command was not found. Make sure you have it installed and in PATH to produce PDF output.)
|
||||
@echo " SKIP Sphinx $@ target."
|
||||
|
||||
else # HAVE_PDFLATEX
|
||||
|
||||
pdfdocs: latexdocs
|
||||
$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
|
||||
|
||||
endif # HAVE_PDFLATEX
|
||||
|
||||
epubdocs:
|
||||
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var)))
|
||||
|
||||
xmldocs:
|
||||
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var)))
|
||||
|
||||
endif # HAVE_SPHINX
|
||||
|
||||
# The following targets are independent of HAVE_SPHINX, and the rules should
|
||||
# work or silently pass without Sphinx.
|
||||
|
||||
# no-ops for the Sphinx toolchain
|
||||
sgmldocs:
|
||||
@:
|
||||
psdocs:
|
||||
@:
|
||||
mandocs:
|
||||
@:
|
||||
installmandocs:
|
||||
@:
|
||||
|
||||
cleandocs:
|
||||
$(Q)rm -rf $(BUILDDIR)
|
||||
$(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean
|
||||
|
||||
dochelp:
|
||||
@echo ' Linux kernel internal documentation in different formats (Sphinx):'
|
||||
@echo ' htmldocs - HTML'
|
||||
@echo ' latexdocs - LaTeX'
|
||||
@echo ' pdfdocs - PDF'
|
||||
@echo ' epubdocs - EPUB'
|
||||
@echo ' xmldocs - XML'
|
||||
@echo ' linkcheckdocs - check for broken external links (will connect to external hosts)'
|
||||
@echo ' cleandocs - clean all generated files'
|
||||
@echo
|
||||
@echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2'
|
||||
@echo ' valid values for SPHINXDIRS are: $(_SPHINXDIRS)'
|
||||
@echo
|
||||
@echo ' make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build'
|
||||
@echo ' configuration. This is e.g. useful to build with nit-picking config.'
|
@ -186,7 +186,7 @@ must disable interrupts while the lock is held. If the device sends
|
||||
a different interrupt, the driver will deadlock trying to recursively
|
||||
acquire the spinlock. Such deadlocks can be avoided by using
|
||||
spin_lock_irqsave() or spin_lock_irq() which disable local interrupts
|
||||
and acquire the lock (see Documentation/DocBook/kernel-locking).
|
||||
and acquire the lock (see Documentation/kernel-hacking/locking.rst).
|
||||
|
||||
4.5 How to tell whether MSI/MSI-X is enabled on a device
|
||||
|
||||
|
@ -28,8 +28,6 @@ stallwarn.txt
|
||||
- RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress)
|
||||
torture.txt
|
||||
- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
|
||||
trace.txt
|
||||
- CONFIG_RCU_TRACE debugfs files and formats
|
||||
UP.txt
|
||||
- RCU on Uniprocessor Systems
|
||||
whatisRCU.txt
|
||||
|
@ -559,9 +559,7 @@ The <tt>rcu_access_pointer()</tt> on line 6 is similar to
|
||||
For <tt>remove_gp_synchronous()</tt>, as long as all modifications
|
||||
to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
|
||||
the above optimizations are harmless.
|
||||
However,
|
||||
with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
|
||||
<tt>sparse</tt> will complain if you
|
||||
However, <tt>sparse</tt> will complain if you
|
||||
define <tt>gp</tt> with <tt>__rcu</tt> and then
|
||||
access it without using
|
||||
either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
|
||||
@ -1849,7 +1847,8 @@ mass storage, or user patience, whichever comes first.
|
||||
If the nesting is not visible to the compiler, as is the case with
|
||||
mutually recursive functions each in its own translation unit,
|
||||
stack overflow will result.
|
||||
If the nesting takes the form of loops, either the control variable
|
||||
If the nesting takes the form of loops, perhaps in the guise of tail
|
||||
recursion, either the control variable
|
||||
will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
|
||||
Nevertheless, this class of RCU implementations is one
|
||||
of the most composable constructs in existence.
|
||||
@ -1977,9 +1976,8 @@ guard against mishaps and misuse:
|
||||
and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
|
||||
substituting a simple assignment.
|
||||
To catch this sort of error, a given RCU-protected pointer may be
|
||||
tagged with <tt>__rcu</tt>, after which running sparse
|
||||
with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
|
||||
about simple-assignment accesses to that pointer.
|
||||
tagged with <tt>__rcu</tt>, after which sparse
|
||||
will complain about simple-assignment accesses to that pointer.
|
||||
Arnd Bergmann made me aware of this requirement, and also
|
||||
supplied the needed
|
||||
<a href="https://lwn.net/Articles/376011/">patch series</a>.
|
||||
@ -2036,7 +2034,7 @@ guard against mishaps and misuse:
|
||||
some other synchronization mechanism, for example, reference
|
||||
counting.
|
||||
<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
|
||||
information is provided via both debugfs and event tracing.
|
||||
information is provided via event tracing.
|
||||
<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
|
||||
<tt>rcu_dereference()</tt> to create typical linked
|
||||
data structures can be surprisingly error-prone.
|
||||
@ -2519,11 +2517,7 @@ It is similarly socially unacceptable to interrupt an
|
||||
<tt>nohz_full</tt> CPU running in userspace.
|
||||
RCU must therefore track <tt>nohz_full</tt> userspace
|
||||
execution.
|
||||
And in
|
||||
<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
|
||||
kernels, RCU must separately track idle CPUs on the one hand and
|
||||
CPUs that are either idle or executing in userspace on the other.
|
||||
In both cases, RCU must be able to sample state at two points in
|
||||
RCU must therefore be able to sample state at two points in
|
||||
time, and be able to determine whether or not some other CPU spent
|
||||
any time idle and/or executing in userspace.
|
||||
|
||||
@ -2935,6 +2929,20 @@ The reason that this is possible is that SRCU is insensitive
|
||||
to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
|
||||
need not exclude CPU-hotplug operations.
|
||||
|
||||
<p>
|
||||
SRCU also differs from other RCU flavors in that SRCU's expedited and
|
||||
non-expedited grace periods are implemented by the same mechanism.
|
||||
This means that in the current SRCU implementation, expediting a
|
||||
future grace period has the side effect of expediting all prior
|
||||
grace periods that have not yet completed.
|
||||
(But please note that this is a property of the current implementation,
|
||||
not necessarily of future implementations.)
|
||||
In addition, if SRCU has been idle for longer than the interval
|
||||
specified by the <tt>srcutree.exp_holdoff</tt> kernel boot parameter
|
||||
(25 microseconds by default),
|
||||
and if a <tt>synchronize_srcu()</tt> invocation ends this idle period,
|
||||
that invocation will be automatically expedited.
|
||||
|
||||
<p>
|
||||
As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating
|
||||
a locking bottleneck present in prior kernel versions.
|
||||
|
@ -413,11 +413,11 @@ over a rather long period of time, but improvements are always welcome!
|
||||
read-side critical sections. It is the responsibility of the
|
||||
RCU update-side primitives to deal with this.
|
||||
|
||||
17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
||||
__rcu sparse checks (enabled by CONFIG_SPARSE_RCU_POINTER) to
|
||||
validate your RCU code. These can help find problems as follows:
|
||||
17. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
||||
__rcu sparse checks to validate your RCU code. These can help
|
||||
find problems as follows:
|
||||
|
||||
CONFIG_PROVE_RCU: check that accesses to RCU-protected data
|
||||
CONFIG_PROVE_LOCKING: check that accesses to RCU-protected data
|
||||
structures are carried out under the proper RCU
|
||||
read-side critical section, while holding the right
|
||||
combination of locks, or whatever other conditions
|
||||
|
@ -1,535 +0,0 @@
|
||||
CONFIG_RCU_TRACE debugfs Files and Formats
|
||||
|
||||
|
||||
The rcutree and rcutiny implementations of RCU provide debugfs trace
|
||||
output that summarizes counters and state. This information is useful for
|
||||
debugging RCU itself, and can sometimes also help to debug abuses of RCU.
|
||||
The following sections describe the debugfs files and formats, first
|
||||
for rcutree and next for rcutiny.
|
||||
|
||||
|
||||
CONFIG_TREE_RCU and CONFIG_PREEMPT_RCU debugfs Files and Formats
|
||||
|
||||
These implementations of RCU provide several debugfs directories under the
|
||||
top-level directory "rcu":
|
||||
|
||||
rcu/rcu_bh
|
||||
rcu/rcu_preempt
|
||||
rcu/rcu_sched
|
||||
|
||||
Each directory contains files for the corresponding flavor of RCU.
|
||||
Note that rcu/rcu_preempt is only present for CONFIG_PREEMPT_RCU.
|
||||
For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor,
|
||||
so that activity for both appears in rcu/rcu_sched.
|
||||
|
||||
In addition, the following file appears in the top-level directory:
|
||||
rcu/rcutorture. This file displays rcutorture test progress. The output
|
||||
of "cat rcu/rcutorture" looks as follows:
|
||||
|
||||
rcutorture test sequence: 0 (test in progress)
|
||||
rcutorture update version number: 615
|
||||
|
||||
The first line shows the number of rcutorture tests that have completed
|
||||
since boot. If a test is currently running, the "(test in progress)"
|
||||
string will appear as shown above. The second line shows the number of
|
||||
update cycles that the current test has started, or zero if there is
|
||||
no test in progress.
|
||||
|
||||
|
||||
Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly
|
||||
also rcu/rcu_preempt) the following files will be present:
|
||||
|
||||
rcudata:
|
||||
Displays fields in struct rcu_data.
|
||||
rcuexp:
|
||||
Displays statistics for expedited grace periods.
|
||||
rcugp:
|
||||
Displays grace-period counters.
|
||||
rcuhier:
|
||||
Displays the struct rcu_node hierarchy.
|
||||
rcu_pending:
|
||||
Displays counts of the reasons rcu_pending() decided that RCU had
|
||||
work to do.
|
||||
rcuboost:
|
||||
Displays RCU boosting statistics. Only present if
|
||||
CONFIG_RCU_BOOST=y.
|
||||
|
||||
The output of "cat rcu/rcu_preempt/rcudata" looks as follows:
|
||||
|
||||
0!c=30455 g=30456 cnq=1/0:1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
|
||||
1!c=30719 g=30720 cnq=1/0:0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
|
||||
2!c=30150 g=30151 cnq=1/1:1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
|
||||
3 c=31249 g=31250 cnq=1/1:0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
|
||||
4!c=29502 g=29503 cnq=1/0:1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
|
||||
5 c=31201 g=31202 cnq=1/0:1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
|
||||
6!c=30253 g=30254 cnq=1/0:1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
|
||||
7 c=31178 g=31178 cnq=1/0:0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
|
||||
|
||||
This file has one line per CPU, or eight for this 8-CPU system.
|
||||
The fields are as follows:
|
||||
|
||||
o The number at the beginning of each line is the CPU number.
|
||||
CPUs numbers followed by an exclamation mark are offline,
|
||||
but have been online at least once since boot. There will be
|
||||
no output for CPUs that have never been online, which can be
|
||||
a good thing in the surprisingly common case where NR_CPUS is
|
||||
substantially larger than the number of actual CPUs.
|
||||
|
||||
o "c" is the count of grace periods that this CPU believes have
|
||||
completed. Offlined CPUs and CPUs in dynticks idle mode may lag
|
||||
quite a ways behind, for example, CPU 4 under "rcu_sched" above,
|
||||
which has been offline through 16 RCU grace periods. It is not
|
||||
unusual to see offline CPUs lagging by thousands of grace periods.
|
||||
Note that although the grace-period number is an unsigned long,
|
||||
it is printed out as a signed long to allow more human-friendly
|
||||
representation near boot time.
|
||||
|
||||
o "g" is the count of grace periods that this CPU believes have
|
||||
started. Again, offlined CPUs and CPUs in dynticks idle mode
|
||||
may lag behind. If the "c" and "g" values are equal, this CPU
|
||||
has already reported a quiescent state for the last RCU grace
|
||||
period that it is aware of, otherwise, the CPU believes that it
|
||||
owes RCU a quiescent state.
|
||||
|
||||
o "pq" indicates that this CPU has passed through a quiescent state
|
||||
for the current grace period. It is possible for "pq" to be
|
||||
"1" and "c" different than "g", which indicates that although
|
||||
the CPU has passed through a quiescent state, either (1) this
|
||||
CPU has not yet reported that fact, (2) some other CPU has not
|
||||
yet reported for this grace period, or (3) both.
|
||||
|
||||
o "qp" indicates that RCU still expects a quiescent state from
|
||||
this CPU. Offlined CPUs and CPUs in dyntick idle mode might
|
||||
well have qp=1, which is OK: RCU is still ignoring them.
|
||||
|
||||
o "dt" is the current value of the dyntick counter that is incremented
|
||||
when entering or leaving idle, either due to a context switch or
|
||||
due to an interrupt. This number is even if the CPU is in idle
|
||||
from RCU's viewpoint and odd otherwise. The number after the
|
||||
first "/" is the interrupt nesting depth when in idle state,
|
||||
or a large number added to the interrupt-nesting depth when
|
||||
running a non-idle task. Some architectures do not accurately
|
||||
count interrupt nesting when running in non-idle kernel context,
|
||||
which can result in interesting anomalies such as negative
|
||||
interrupt-nesting levels. The number after the second "/"
|
||||
is the NMI nesting depth.
|
||||
|
||||
o "df" is the number of times that some other CPU has forced a
|
||||
quiescent state on behalf of this CPU due to this CPU being in
|
||||
idle state.
|
||||
|
||||
o "of" is the number of times that some other CPU has forced a
|
||||
quiescent state on behalf of this CPU due to this CPU being
|
||||
offline. In a perfect world, this might never happen, but it
|
||||
turns out that offlining and onlining a CPU can take several grace
|
||||
periods, and so there is likely to be an extended period of time
|
||||
when RCU believes that the CPU is online when it really is not.
|
||||
Please note that erring in the other direction (RCU believing a
|
||||
CPU is offline when it is really alive and kicking) is a fatal
|
||||
error, so it makes sense to err conservatively.
|
||||
|
||||
o "ql" is the number of RCU callbacks currently residing on
|
||||
this CPU. The first number is the number of "lazy" callbacks
|
||||
that are known to RCU to only be freeing memory, and the number
|
||||
after the "/" is the total number of callbacks, lazy or not.
|
||||
These counters count callbacks regardless of what phase of
|
||||
grace-period processing that they are in (new, waiting for
|
||||
grace period to start, waiting for grace period to end, ready
|
||||
to invoke).
|
||||
|
||||
o "qs" gives an indication of the state of the callback queue
|
||||
with four characters:
|
||||
|
||||
"N" Indicates that there are callbacks queued that are not
|
||||
ready to be handled by the next grace period, and thus
|
||||
will be handled by the grace period following the next
|
||||
one.
|
||||
|
||||
"R" Indicates that there are callbacks queued that are
|
||||
ready to be handled by the next grace period.
|
||||
|
||||
"W" Indicates that there are callbacks queued that are
|
||||
waiting on the current grace period.
|
||||
|
||||
"D" Indicates that there are callbacks queued that have
|
||||
already been handled by a prior grace period, and are
|
||||
thus waiting to be invoked. Note that callbacks in
|
||||
the process of being invoked are not counted here.
|
||||
Callbacks in the process of being invoked are those
|
||||
that have been removed from the rcu_data structures
|
||||
queues by rcu_do_batch(), but which have not yet been
|
||||
invoked.
|
||||
|
||||
If there are no callbacks in a given one of the above states,
|
||||
the corresponding character is replaced by ".".
|
||||
|
||||
o "b" is the batch limit for this CPU. If more than this number
|
||||
of RCU callbacks is ready to invoke, then the remainder will
|
||||
be deferred.
|
||||
|
||||
o "ci" is the number of RCU callbacks that have been invoked for
|
||||
this CPU. Note that ci+nci+ql is the number of callbacks that have
|
||||
been registered in absence of CPU-hotplug activity.
|
||||
|
||||
o "nci" is the number of RCU callbacks that have been offloaded from
|
||||
this CPU. This will always be zero unless the kernel was built
|
||||
with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot
|
||||
parameter was specified.
|
||||
|
||||
o "co" is the number of RCU callbacks that have been orphaned due to
|
||||
this CPU going offline. These orphaned callbacks have been moved
|
||||
to an arbitrarily chosen online CPU.
|
||||
|
||||
o "ca" is the number of RCU callbacks that have been adopted by this
|
||||
CPU due to other CPUs going offline. Note that ci+co-ca+ql is
|
||||
the number of RCU callbacks registered on this CPU.
|
||||
|
||||
|
||||
Kernels compiled with CONFIG_RCU_BOOST=y display the following from
|
||||
/debug/rcu/rcu_preempt/rcudata:
|
||||
|
||||
0!c=12865 g=12866 cnq=1/0:1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
|
||||
1 c=14407 g=14408 cnq=1/0:0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
|
||||
2 c=14407 g=14408 cnq=1/0:0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
|
||||
3 c=14407 g=14408 cnq=1/0:0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
|
||||
4 c=14405 g=14406 cnq=1/0:1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
|
||||
5!c=14168 g=14169 cnq=1/0:0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
|
||||
6 c=14404 g=14405 cnq=1/0:0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
|
||||
7 c=14407 g=14408 cnq=1/0:1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
|
||||
|
||||
This is similar to the output discussed above, but contains the following
|
||||
additional fields:
|
||||
|
||||
o "kt" is the per-CPU kernel-thread state. The digit preceding
|
||||
the first slash is zero if there is no work pending and 1
|
||||
otherwise. The character between the first pair of slashes is
|
||||
as follows:
|
||||
|
||||
"S" The kernel thread is stopped, in other words, all
|
||||
CPUs corresponding to this rcu_node structure are
|
||||
offline.
|
||||
|
||||
"R" The kernel thread is running.
|
||||
|
||||
"W" The kernel thread is waiting because there is no work
|
||||
for it to do.
|
||||
|
||||
"O" The kernel thread is waiting because it has been
|
||||
forced off of its designated CPU or because its
|
||||
->cpus_allowed mask permits it to run on other than
|
||||
its designated CPU.
|
||||
|
||||
"Y" The kernel thread is yielding to avoid hogging CPU.
|
||||
|
||||
"?" Unknown value, indicates a bug.
|
||||
|
||||
The number after the final slash is the CPU that the kthread
|
||||
is actually running on.
|
||||
|
||||
This field is displayed only for CONFIG_RCU_BOOST kernels.
|
||||
|
||||
o "ktl" is the low-order 16 bits (in hexadecimal) of the count of
|
||||
the number of times that this CPU's per-CPU kthread has gone
|
||||
through its loop servicing invoke_rcu_cpu_kthread() requests.
|
||||
|
||||
This field is displayed only for CONFIG_RCU_BOOST kernels.
|
||||
|
||||
|
||||
The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
|
||||
|
||||
s=21872 wd1=0 wd2=0 wd3=5 enq=0 sc=21872
|
||||
|
||||
These fields are as follows:
|
||||
|
||||
o "s" is the sequence number, with an odd number indicating that
|
||||
an expedited grace period is in progress.
|
||||
|
||||
o "wd1", "wd2", and "wd3" are the number of times that an attempt
|
||||
to start an expedited grace period found that someone else had
|
||||
completed an expedited grace period that satisfies the attempted
|
||||
request. "Our work is done."
|
||||
|
||||
o "enq" is the number of quiescent states still outstanding.
|
||||
|
||||
o "sc" is the number of times that the attempt to start a
|
||||
new expedited grace period succeeded.
|
||||
|
||||
|
||||
The output of "cat rcu/rcu_preempt/rcugp" looks as follows:
|
||||
|
||||
completed=31249 gpnum=31250 age=1 max=18
|
||||
|
||||
These fields are taken from the rcu_state structure, and are as follows:
|
||||
|
||||
o "completed" is the number of grace periods that have completed.
|
||||
It is comparable to the "c" field from rcu/rcudata in that a
|
||||
CPU whose "c" field matches the value of "completed" is aware
|
||||
that the corresponding RCU grace period has completed.
|
||||
|
||||
o "gpnum" is the number of grace periods that have started. It is
|
||||
similarly comparable to the "g" field from rcu/rcudata in that
|
||||
a CPU whose "g" field matches the value of "gpnum" is aware that
|
||||
the corresponding RCU grace period has started.
|
||||
|
||||
If these two fields are equal, then there is no grace period
|
||||
in progress, in other words, RCU is idle. On the other hand,
|
||||
if the two fields differ (as they are above), then an RCU grace
|
||||
period is in progress.
|
||||
|
||||
o "age" is the number of jiffies that the current grace period
|
||||
has extended for, or zero if there is no grace period currently
|
||||
in effect.
|
||||
|
||||
o "max" is the age in jiffies of the longest-duration grace period
|
||||
thus far.
|
||||
|
||||
The output of "cat rcu/rcu_preempt/rcuhier" looks as follows:
|
||||
|
||||
c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0
|
||||
3/3 ..>. 0:7 ^0
|
||||
e/e ..>. 0:3 ^0 d/d ..>. 4:7 ^1
|
||||
|
||||
The fields are as follows:
|
||||
|
||||
o "c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp.
|
||||
|
||||
o "g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp.
|
||||
|
||||
o "s" is the current state of the force_quiescent_state()
|
||||
state machine.
|
||||
|
||||
o "jfq" is the number of jiffies remaining for this grace period
|
||||
before force_quiescent_state() is invoked to help push things
|
||||
along. Note that CPUs in idle mode throughout the grace period
|
||||
will not report on their own, but rather must be check by some
|
||||
other CPU via force_quiescent_state().
|
||||
|
||||
o "j" is the low-order four hex digits of the jiffies counter.
|
||||
Yes, Paul did run into a number of problems that turned out to
|
||||
be due to the jiffies counter no longer counting. Why do you ask?
|
||||
|
||||
o "nfqs" is the number of calls to force_quiescent_state() since
|
||||
boot.
|
||||
|
||||
o "nfqsng" is the number of useless calls to force_quiescent_state(),
|
||||
where there wasn't actually a grace period active. This can
|
||||
no longer happen due to grace-period processing being pushed
|
||||
into a kthread. The number in parentheses is the difference
|
||||
between "nfqs" and "nfqsng", or the number of times that
|
||||
force_quiescent_state() actually did some real work.
|
||||
|
||||
o "fqlh" is the number of calls to force_quiescent_state() that
|
||||
exited immediately (without even being counted in nfqs above)
|
||||
due to contention on ->fqslock.
|
||||
|
||||
o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node
|
||||
structure. Each line represents one level of the hierarchy,
|
||||
from root to leaves. It is best to think of the rcu_data
|
||||
structures as forming yet another level after the leaves.
|
||||
Note that there might be either one, two, three, or even four
|
||||
levels of rcu_node structures, depending on the relationship
|
||||
between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly
|
||||
adjusted using the rcu_fanout_leaf kernel boot parameter), and
|
||||
CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of
|
||||
possible CPUs for the booting hardware).
|
||||
|
||||
o The numbers separated by the "/" are the qsmask followed
|
||||
by the qsmaskinit. The qsmask will have one bit
|
||||
set for each entity in the next lower level that has
|
||||
not yet checked in for the current grace period ("e"
|
||||
indicating CPUs 5, 6, and 7 in the example above).
|
||||
The qsmaskinit will have one bit for each entity that is
|
||||
currently expected to check in during each grace period.
|
||||
The value of qsmaskinit is assigned to that of qsmask
|
||||
at the beginning of each grace period.
|
||||
|
||||
o The characters separated by the ">" indicate the state
|
||||
of the blocked-tasks lists. A "G" preceding the ">"
|
||||
indicates that at least one task blocked in an RCU
|
||||
read-side critical section blocks the current grace
|
||||
period, while a "E" preceding the ">" indicates that
|
||||
at least one task blocked in an RCU read-side critical
|
||||
section blocks the current expedited grace period.
|
||||
A "T" character following the ">" indicates that at
|
||||
least one task is blocked within an RCU read-side
|
||||
critical section, regardless of whether any current
|
||||
grace period (expedited or normal) is inconvenienced.
|
||||
A "." character appears if the corresponding condition
|
||||
does not hold, so that "..>." indicates that no tasks
|
||||
are blocked. In contrast, "GE>T" indicates maximal
|
||||
inconvenience from blocked tasks. CONFIG_TREE_RCU
|
||||
builds of the kernel will always show "..>.".
|
||||
|
||||
o The numbers separated by the ":" are the range of CPUs
|
||||
served by this struct rcu_node. This can be helpful
|
||||
in working out how the hierarchy is wired together.
|
||||
|
||||
For example, the example rcu_node structure shown above
|
||||
has "0:7", indicating that it covers CPUs 0 through 7.
|
||||
|
||||
o The number after the "^" indicates the bit in the
|
||||
next higher level rcu_node structure that this rcu_node
|
||||
structure corresponds to. For example, the "d/d ..>. 4:7
|
||||
^1" has a "1" in this position, indicating that it
|
||||
corresponds to the "1" bit in the "3" shown in the
|
||||
"3/3 ..>. 0:7 ^0" entry on the next level up.
|
||||
|
||||
|
||||
The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:
|
||||
|
||||
0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903 ndw=0
|
||||
1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113 ndw=0
|
||||
2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889 ndw=0
|
||||
3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469 ndw=0
|
||||
4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042 ndw=0
|
||||
5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422 ndw=0
|
||||
6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699 ndw=0
|
||||
7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147 ndw=0
|
||||
|
||||
The fields are as follows:
|
||||
|
||||
o The leading number is the CPU number, with "!" indicating
|
||||
an offline CPU.
|
||||
|
||||
o "np" is the number of times that __rcu_pending() has been invoked
|
||||
for the corresponding flavor of RCU.
|
||||
|
||||
o "qsp" is the number of times that the RCU was waiting for a
|
||||
quiescent state from this CPU.
|
||||
|
||||
o "rpq" is the number of times that the CPU had passed through
|
||||
a quiescent state, but not yet reported it to RCU.
|
||||
|
||||
o "cbr" is the number of times that this CPU had RCU callbacks
|
||||
that had passed through a grace period, and were thus ready
|
||||
to be invoked.
|
||||
|
||||
o "cng" is the number of times that this CPU needed another
|
||||
grace period while RCU was idle.
|
||||
|
||||
o "gpc" is the number of times that an old grace period had
|
||||
completed, but this CPU was not yet aware of it.
|
||||
|
||||
o "gps" is the number of times that a new grace period had started,
|
||||
but this CPU was not yet aware of it.
|
||||
|
||||
o "ndw" is the number of times that a wakeup of an rcuo
|
||||
callback-offload kthread had to be deferred in order to avoid
|
||||
deadlock.
|
||||
|
||||
o "nn" is the number of times that this CPU needed nothing.
|
||||
|
||||
|
||||
The output of "cat rcu/rcuboost" looks as follows:
|
||||
|
||||
0:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
|
||||
balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0
|
||||
4:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
|
||||
balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0
|
||||
|
||||
This information is output only for rcu_preempt. Each two-line entry
|
||||
corresponds to a leaf rcu_node structure. The fields are as follows:
|
||||
|
||||
o "n:m" is the CPU-number range for the corresponding two-line
|
||||
entry. In the sample output above, the first entry covers
|
||||
CPUs zero through three and the second entry covers CPUs four
|
||||
through seven.
|
||||
|
||||
o "tasks=TNEB" gives the state of the various segments of the
|
||||
rnp->blocked_tasks list:
|
||||
|
||||
"T" This indicates that there are some tasks that blocked
|
||||
while running on one of the corresponding CPUs while
|
||||
in an RCU read-side critical section.
|
||||
|
||||
"N" This indicates that some of the blocked tasks are preventing
|
||||
the current normal (non-expedited) grace period from
|
||||
completing.
|
||||
|
||||
"E" This indicates that some of the blocked tasks are preventing
|
||||
the current expedited grace period from completing.
|
||||
|
||||
"B" This indicates that some of the blocked tasks are in
|
||||
need of RCU priority boosting.
|
||||
|
||||
Each character is replaced with "." if the corresponding
|
||||
condition does not hold.
|
||||
|
||||
o "kt" is the state of the RCU priority-boosting kernel
|
||||
thread associated with the corresponding rcu_node structure.
|
||||
The state can be one of the following:
|
||||
|
||||
"S" The kernel thread is stopped, in other words, all
|
||||
CPUs corresponding to this rcu_node structure are
|
||||
offline.
|
||||
|
||||
"R" The kernel thread is running.
|
||||
|
||||
"W" The kernel thread is waiting because there is no work
|
||||
for it to do.
|
||||
|
||||
"Y" The kernel thread is yielding to avoid hogging CPU.
|
||||
|
||||
"?" Unknown value, indicates a bug.
|
||||
|
||||
o "ntb" is the number of tasks boosted.
|
||||
|
||||
o "neb" is the number of tasks boosted in order to complete an
|
||||
expedited grace period.
|
||||
|
||||
o "nnb" is the number of tasks boosted in order to complete a
|
||||
normal (non-expedited) grace period. When boosting a task
|
||||
that was blocking both an expedited and a normal grace period,
|
||||
it is counted against the expedited total above.
|
||||
|
||||
o "j" is the low-order 16 bits of the jiffies counter in
|
||||
hexadecimal.
|
||||
|
||||
o "bt" is the low-order 16 bits of the value that the jiffies
|
||||
counter will have when we next start boosting, assuming that
|
||||
the current grace period does not end beforehand. This is
|
||||
also in hexadecimal.
|
||||
|
||||
o "balk: nt" counts the number of times we didn't boost (in
|
||||
other words, we balked) even though it was time to boost because
|
||||
there were no blocked tasks to boost. This situation occurs
|
||||
when there is one blocked task on one rcu_node structure and
|
||||
none on some other rcu_node structure.
|
||||
|
||||
o "egt" counts the number of times we balked because although
|
||||
there were blocked tasks, none of them were blocking the
|
||||
current grace period, whether expedited or otherwise.
|
||||
|
||||
o "bt" counts the number of times we balked because boosting
|
||||
had already been initiated for the current grace period.
|
||||
|
||||
o "nb" counts the number of times we balked because there
|
||||
was at least one task blocking the current non-expedited grace
|
||||
period that never had blocked. If it is already running, it
|
||||
just won't help to boost its priority!
|
||||
|
||||
o "ny" counts the number of times we balked because it was
|
||||
not yet time to start boosting.
|
||||
|
||||
o "nos" counts the number of times we balked for other
|
||||
reasons, e.g., the grace period ended first.
|
||||
|
||||
|
||||
CONFIG_TINY_RCU debugfs Files and Formats
|
||||
|
||||
These implementations of RCU provides a single debugfs file under the
|
||||
top-level directory RCU, namely rcu/rcudata, which displays fields in
|
||||
rcu_bh_ctrlblk and rcu_sched_ctrlblk.
|
||||
|
||||
The output of "cat rcu/rcudata" is as follows:
|
||||
|
||||
rcu_sched: qlen: 0
|
||||
rcu_bh: qlen: 0
|
||||
|
||||
This is split into rcu_sched and rcu_bh sections. The field is as
|
||||
follows:
|
||||
|
||||
o "qlen" is the number of RCU callbacks currently waiting either
|
||||
for an RCU grace period or waiting to be invoked. This is the
|
||||
only field present for rcu_sched and rcu_bh, due to the
|
||||
short-circuiting of grace period in those two cases.
|
@ -1,5 +1,9 @@
|
||||
Linux 2.4.2 Secure Attention Key (SAK) handling
|
||||
18 March 2001, Andrew Morton
|
||||
=========================================
|
||||
Linux Secure Attention Key (SAK) handling
|
||||
=========================================
|
||||
|
||||
:Date: 18 March 2001
|
||||
:Author: Andrew Morton
|
||||
|
||||
An operating system's Secure Attention Key is a security tool which is
|
||||
provided as protection against trojan password capturing programs. It
|
||||
@ -13,7 +17,7 @@ this sequence. It is only available if the kernel was compiled with
|
||||
sysrq support.
|
||||
|
||||
The proper way of generating a SAK is to define the key sequence using
|
||||
`loadkeys'. This will work whether or not sysrq support is compiled
|
||||
``loadkeys``. This will work whether or not sysrq support is compiled
|
||||
into the kernel.
|
||||
|
||||
SAK works correctly when the keyboard is in raw mode. This means that
|
||||
@ -25,64 +29,63 @@ What key sequence should you use? Well, CTRL-ALT-DEL is used to reboot
|
||||
the machine. CTRL-ALT-BACKSPACE is magical to the X server. We'll
|
||||
choose CTRL-ALT-PAUSE.
|
||||
|
||||
In your rc.sysinit (or rc.local) file, add the command
|
||||
In your rc.sysinit (or rc.local) file, add the command::
|
||||
|
||||
echo "control alt keycode 101 = SAK" | /bin/loadkeys
|
||||
|
||||
And that's it! Only the superuser may reprogram the SAK key.
|
||||
|
||||
|
||||
NOTES
|
||||
=====
|
||||
.. note::
|
||||
|
||||
1: Linux SAK is said to be not a "true SAK" as is required by
|
||||
systems which implement C2 level security. This author does not
|
||||
know why.
|
||||
1. Linux SAK is said to be not a "true SAK" as is required by
|
||||
systems which implement C2 level security. This author does not
|
||||
know why.
|
||||
|
||||
|
||||
2: On the PC keyboard, SAK kills all applications which have
|
||||
/dev/console opened.
|
||||
2. On the PC keyboard, SAK kills all applications which have
|
||||
/dev/console opened.
|
||||
|
||||
Unfortunately this includes a number of things which you don't
|
||||
actually want killed. This is because these applications are
|
||||
incorrectly holding /dev/console open. Be sure to complain to your
|
||||
Linux distributor about this!
|
||||
Unfortunately this includes a number of things which you don't
|
||||
actually want killed. This is because these applications are
|
||||
incorrectly holding /dev/console open. Be sure to complain to your
|
||||
Linux distributor about this!
|
||||
|
||||
You can identify processes which will be killed by SAK with the
|
||||
command
|
||||
You can identify processes which will be killed by SAK with the
|
||||
command::
|
||||
|
||||
# ls -l /proc/[0-9]*/fd/* | grep console
|
||||
l-wx------ 1 root root 64 Mar 18 00:46 /proc/579/fd/0 -> /dev/console
|
||||
|
||||
Then:
|
||||
Then::
|
||||
|
||||
# ps aux|grep 579
|
||||
root 579 0.0 0.1 1088 436 ? S 00:43 0:00 gpm -t ps/2
|
||||
|
||||
So `gpm' will be killed by SAK. This is a bug in gpm. It should
|
||||
be closing standard input. You can work around this by finding the
|
||||
initscript which launches gpm and changing it thusly:
|
||||
So ``gpm`` will be killed by SAK. This is a bug in gpm. It should
|
||||
be closing standard input. You can work around this by finding the
|
||||
initscript which launches gpm and changing it thusly:
|
||||
|
||||
Old:
|
||||
Old::
|
||||
|
||||
daemon gpm
|
||||
|
||||
New:
|
||||
New::
|
||||
|
||||
daemon gpm < /dev/null
|
||||
|
||||
Vixie cron also seems to have this problem, and needs the same treatment.
|
||||
Vixie cron also seems to have this problem, and needs the same treatment.
|
||||
|
||||
Also, one prominent Linux distribution has the following three
|
||||
lines in its rc.sysinit and rc scripts:
|
||||
Also, one prominent Linux distribution has the following three
|
||||
lines in its rc.sysinit and rc scripts::
|
||||
|
||||
exec 3<&0
|
||||
exec 4>&1
|
||||
exec 5>&2
|
||||
|
||||
These commands cause *all* daemons which are launched by the
|
||||
initscripts to have file descriptors 3, 4 and 5 attached to
|
||||
/dev/console. So SAK kills them all. A workaround is to simply
|
||||
delete these lines, but this may cause system management
|
||||
applications to malfunction - test everything well.
|
||||
These commands cause **all** daemons which are launched by the
|
||||
initscripts to have file descriptors 3, 4 and 5 attached to
|
||||
/dev/console. So SAK kills them all. A workaround is to simply
|
||||
delete these lines, but this may cause system management
|
||||
applications to malfunction - test everything well.
|
||||
|
||||
|
@ -1,7 +1,10 @@
|
||||
SM501 Driver
|
||||
============
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
Copyright 2006, 2007 Simtec Electronics
|
||||
============
|
||||
SM501 Driver
|
||||
============
|
||||
|
||||
:Copyright: |copy| 2006, 2007 Simtec Electronics
|
||||
|
||||
The Silicon Motion SM501 multimedia companion chip is a multifunction device
|
||||
which may provide numerous interfaces including USB host controller USB gadget,
|
||||
|
@ -59,20 +59,28 @@ button driver uses the following 3 modes in order not to trigger issues.
|
||||
If the userspace hasn't been prepared to ignore the unreliable "opened"
|
||||
events and the unreliable initial state notification, Linux users can use
|
||||
the following kernel parameters to handle the possible issues:
|
||||
A. button.lid_init_state=open:
|
||||
A. button.lid_init_state=method:
|
||||
When this option is specified, the ACPI button driver reports the
|
||||
initial lid state using the returning value of the _LID control method
|
||||
and whether the "opened"/"closed" events are paired fully relies on the
|
||||
firmware implementation.
|
||||
This option can be used to fix some platforms where the returning value
|
||||
of the _LID control method is reliable but the initial lid state
|
||||
notification is missing.
|
||||
This option is the default behavior during the period the userspace
|
||||
isn't ready to handle the buggy AML tables.
|
||||
B. button.lid_init_state=open:
|
||||
When this option is specified, the ACPI button driver always reports the
|
||||
initial lid state as "opened" and whether the "opened"/"closed" events
|
||||
are paired fully relies on the firmware implementation.
|
||||
This may fix some platforms where the returning value of the _LID
|
||||
control method is not reliable and the initial lid state notification is
|
||||
missing.
|
||||
This option is the default behavior during the period the userspace
|
||||
isn't ready to handle the buggy AML tables.
|
||||
|
||||
If the userspace has been prepared to ignore the unreliable "opened" events
|
||||
and the unreliable initial state notification, Linux users should always
|
||||
use the following kernel parameter:
|
||||
B. button.lid_init_state=ignore:
|
||||
C. button.lid_init_state=ignore:
|
||||
When this option is specified, the ACPI button driver never reports the
|
||||
initial lid state and there is a compensation mechanism implemented to
|
||||
ensure that the reliable "closed" notifications can always be delievered
|
||||
|
@ -156,3 +156,68 @@ pointed to by its first argument. That should be done in the driver's .probe()
|
||||
routine. On removal, the driver should unregister its GPIO mapping table by
|
||||
calling acpi_dev_remove_driver_gpios() on the ACPI device object where that
|
||||
table was previously registered.
|
||||
|
||||
Using the _CRS fallback
|
||||
-----------------------
|
||||
|
||||
If a device does not have _DSD or the driver does not create ACPI GPIO
|
||||
mapping, the Linux GPIO framework refuses to return any GPIOs. This is
|
||||
because the driver does not know what it actually gets. For example if we
|
||||
have a device like below:
|
||||
|
||||
Device (BTH)
|
||||
{
|
||||
Name (_HID, ...)
|
||||
|
||||
Name (_CRS, ResourceTemplate () {
|
||||
GpioIo (Exclusive, PullNone, 0, 0, IoRestrictionNone,
|
||||
"\\_SB.GPO0", 0, ResourceConsumer) {15}
|
||||
GpioIo (Exclusive, PullNone, 0, 0, IoRestrictionNone,
|
||||
"\\_SB.GPO0", 0, ResourceConsumer) {27}
|
||||
})
|
||||
}
|
||||
|
||||
The driver might expect to get the right GPIO when it does:
|
||||
|
||||
desc = gpiod_get(dev, "reset", GPIOD_OUT_LOW);
|
||||
|
||||
but since there is no way to know the mapping between "reset" and
|
||||
the GpioIo() in _CRS desc will hold ERR_PTR(-ENOENT).
|
||||
|
||||
The driver author can solve this by passing the mapping explictly
|
||||
(the recommended way and documented in the above chapter).
|
||||
|
||||
The ACPI GPIO mapping tables should not contaminate drivers that are not
|
||||
knowing about which exact device they are servicing on. It implies that
|
||||
the ACPI GPIO mapping tables are hardly linked to ACPI ID and certain
|
||||
objects, as listed in the above chapter, of the device in question.
|
||||
|
||||
Getting GPIO descriptor
|
||||
-----------------------
|
||||
|
||||
There are two main approaches to get GPIO resource from ACPI:
|
||||
desc = gpiod_get(dev, connection_id, flags);
|
||||
desc = gpiod_get_index(dev, connection_id, index, flags);
|
||||
|
||||
We may consider two different cases here, i.e. when connection ID is
|
||||
provided and otherwise.
|
||||
|
||||
Case 1:
|
||||
desc = gpiod_get(dev, "non-null-connection-id", flags);
|
||||
desc = gpiod_get_index(dev, "non-null-connection-id", index, flags);
|
||||
|
||||
Case 2:
|
||||
desc = gpiod_get(dev, NULL, flags);
|
||||
desc = gpiod_get_index(dev, NULL, index, flags);
|
||||
|
||||
Case 1 assumes that corresponding ACPI device description must have
|
||||
defined device properties and will prevent to getting any GPIO resources
|
||||
otherwise.
|
||||
|
||||
Case 2 explicitly tells GPIO core to look for resources in _CRS.
|
||||
|
||||
Be aware that gpiod_get_index() in cases 1 and 2, assuming that there
|
||||
are two versions of ACPI device description provided and no mapping is
|
||||
present in the driver, will return different resources. That's why a
|
||||
certain driver has to handle them carefully as explained in previous
|
||||
chapter.
|
||||
|
@ -1,3 +1,7 @@
|
||||
=======
|
||||
LoadPin
|
||||
=======
|
||||
|
||||
LoadPin is a Linux Security Module that ensures all kernel-loaded files
|
||||
(modules, firmware, etc) all originate from the same filesystem, with
|
||||
the expectation that such a filesystem is backed by a read-only device
|
||||
@ -5,13 +9,13 @@ such as dm-verity or CDROM. This allows systems that have a verified
|
||||
and/or unchangeable filesystem to enforce module and firmware loading
|
||||
restrictions without needing to sign the files individually.
|
||||
|
||||
The LSM is selectable at build-time with CONFIG_SECURITY_LOADPIN, and
|
||||
The LSM is selectable at build-time with ``CONFIG_SECURITY_LOADPIN``, and
|
||||
can be controlled at boot-time with the kernel command line option
|
||||
"loadpin.enabled". By default, it is enabled, but can be disabled at
|
||||
boot ("loadpin.enabled=0").
|
||||
"``loadpin.enabled``". By default, it is enabled, but can be disabled at
|
||||
boot ("``loadpin.enabled=0``").
|
||||
|
||||
LoadPin starts pinning when it sees the first file loaded. If the
|
||||
block device backing the filesystem is not read-only, a sysctl is
|
||||
created to toggle pinning: /proc/sys/kernel/loadpin/enabled. (Having
|
||||
created to toggle pinning: ``/proc/sys/kernel/loadpin/enabled``. (Having
|
||||
a mutable filesystem means pinning is mutable too, but having the
|
||||
sysctl allows for easy testing on systems with a mutable filesystem.)
|
@ -1,27 +1,33 @@
|
||||
=======
|
||||
SELinux
|
||||
=======
|
||||
|
||||
If you want to use SELinux, chances are you will want
|
||||
to use the distro-provided policies, or install the
|
||||
latest reference policy release from
|
||||
|
||||
http://oss.tresys.com/projects/refpolicy
|
||||
|
||||
However, if you want to install a dummy policy for
|
||||
testing, you can do using 'mdp' provided under
|
||||
testing, you can do using ``mdp`` provided under
|
||||
scripts/selinux. Note that this requires the selinux
|
||||
userspace to be installed - in particular you will
|
||||
need checkpolicy to compile a kernel, and setfiles and
|
||||
fixfiles to label the filesystem.
|
||||
|
||||
1. Compile the kernel with selinux enabled.
|
||||
2. Type 'make' to compile mdp.
|
||||
2. Type ``make`` to compile ``mdp``.
|
||||
3. Make sure that you are not running with
|
||||
SELinux enabled and a real policy. If
|
||||
you are, reboot with selinux disabled
|
||||
before continuing.
|
||||
4. Run install_policy.sh:
|
||||
4. Run install_policy.sh::
|
||||
|
||||
cd scripts/selinux
|
||||
sh install_policy.sh
|
||||
|
||||
Step 4 will create a new dummy policy valid for your
|
||||
kernel, with a single selinux user, role, and type.
|
||||
It will compile the policy, will set your SELINUXTYPE to
|
||||
dummy in /etc/selinux/config, install the compiled policy
|
||||
as 'dummy', and relabel your filesystem.
|
||||
It will compile the policy, will set your ``SELINUXTYPE`` to
|
||||
``dummy`` in ``/etc/selinux/config``, install the compiled policy
|
||||
as ``dummy``, and relabel your filesystem.
|
@ -1,3 +1,6 @@
|
||||
=====
|
||||
Smack
|
||||
=====
|
||||
|
||||
|
||||
"Good for you, you've decided to clean the elevator!"
|
||||
@ -14,6 +17,7 @@ available to determine which is best suited to the problem
|
||||
at hand.
|
||||
|
||||
Smack consists of three major components:
|
||||
|
||||
- The kernel
|
||||
- Basic utilities, which are helpful but not required
|
||||
- Configuration data
|
||||
@ -39,16 +43,24 @@ The current git repository for Smack user space is:
|
||||
This should make and install on most modern distributions.
|
||||
There are five commands included in smackutil:
|
||||
|
||||
chsmack - display or set Smack extended attribute values
|
||||
smackctl - load the Smack access rules
|
||||
smackaccess - report if a process with one label has access
|
||||
to an object with another
|
||||
chsmack:
|
||||
display or set Smack extended attribute values
|
||||
|
||||
smackctl:
|
||||
load the Smack access rules
|
||||
|
||||
smackaccess:
|
||||
report if a process with one label has access
|
||||
to an object with another
|
||||
|
||||
These two commands are obsolete with the introduction of
|
||||
the smackfs/load2 and smackfs/cipso2 interfaces.
|
||||
|
||||
smackload - properly formats data for writing to smackfs/load
|
||||
smackcipso - properly formats data for writing to smackfs/cipso
|
||||
smackload:
|
||||
properly formats data for writing to smackfs/load
|
||||
|
||||
smackcipso:
|
||||
properly formats data for writing to smackfs/cipso
|
||||
|
||||
In keeping with the intent of Smack, configuration data is
|
||||
minimal and not strictly required. The most important
|
||||
@ -56,15 +68,15 @@ configuration step is mounting the smackfs pseudo filesystem.
|
||||
If smackutil is installed the startup script will take care
|
||||
of this, but it can be manually as well.
|
||||
|
||||
Add this line to /etc/fstab:
|
||||
Add this line to ``/etc/fstab``::
|
||||
|
||||
smackfs /sys/fs/smackfs smackfs defaults 0 0
|
||||
|
||||
The /sys/fs/smackfs directory is created by the kernel.
|
||||
The ``/sys/fs/smackfs`` directory is created by the kernel.
|
||||
|
||||
Smack uses extended attributes (xattrs) to store labels on filesystem
|
||||
objects. The attributes are stored in the extended attribute security
|
||||
name space. A process must have CAP_MAC_ADMIN to change any of these
|
||||
name space. A process must have ``CAP_MAC_ADMIN`` to change any of these
|
||||
attributes.
|
||||
|
||||
The extended attributes that Smack uses are:
|
||||
@ -73,14 +85,17 @@ SMACK64
|
||||
Used to make access control decisions. In almost all cases
|
||||
the label given to a new filesystem object will be the label
|
||||
of the process that created it.
|
||||
|
||||
SMACK64EXEC
|
||||
The Smack label of a process that execs a program file with
|
||||
this attribute set will run with this attribute's value.
|
||||
|
||||
SMACK64MMAP
|
||||
Don't allow the file to be mmapped by a process whose Smack
|
||||
label does not allow all of the access permitted to a process
|
||||
with the label contained in this attribute. This is a very
|
||||
specific use case for shared libraries.
|
||||
|
||||
SMACK64TRANSMUTE
|
||||
Can only have the value "TRUE". If this attribute is present
|
||||
on a directory when an object is created in the directory and
|
||||
@ -89,27 +104,29 @@ SMACK64TRANSMUTE
|
||||
gets the label of the directory instead of the label of the
|
||||
creating process. If the object being created is a directory
|
||||
the SMACK64TRANSMUTE attribute is set as well.
|
||||
|
||||
SMACK64IPIN
|
||||
This attribute is only available on file descriptors for sockets.
|
||||
Use the Smack label in this attribute for access control
|
||||
decisions on packets being delivered to this socket.
|
||||
|
||||
SMACK64IPOUT
|
||||
This attribute is only available on file descriptors for sockets.
|
||||
Use the Smack label in this attribute for access control
|
||||
decisions on packets coming from this socket.
|
||||
|
||||
There are multiple ways to set a Smack label on a file:
|
||||
There are multiple ways to set a Smack label on a file::
|
||||
|
||||
# attr -S -s SMACK64 -V "value" path
|
||||
# chsmack -a value path
|
||||
|
||||
A process can see the Smack label it is running with by
|
||||
reading /proc/self/attr/current. A process with CAP_MAC_ADMIN
|
||||
reading ``/proc/self/attr/current``. A process with ``CAP_MAC_ADMIN``
|
||||
can set the process Smack by writing there.
|
||||
|
||||
Most Smack configuration is accomplished by writing to files
|
||||
in the smackfs filesystem. This pseudo-filesystem is mounted
|
||||
on /sys/fs/smackfs.
|
||||
on ``/sys/fs/smackfs``.
|
||||
|
||||
access
|
||||
Provided for backward compatibility. The access2 interface
|
||||
@ -120,6 +137,7 @@ access
|
||||
this file. The next read will indicate whether the access
|
||||
would be permitted. The text will be either "1" indicating
|
||||
access, or "0" indicating denial.
|
||||
|
||||
access2
|
||||
This interface reports whether a subject with the specified
|
||||
Smack label has a particular access to an object with a
|
||||
@ -127,13 +145,17 @@ access2
|
||||
this file. The next read will indicate whether the access
|
||||
would be permitted. The text will be either "1" indicating
|
||||
access, or "0" indicating denial.
|
||||
|
||||
ambient
|
||||
This contains the Smack label applied to unlabeled network
|
||||
packets.
|
||||
|
||||
change-rule
|
||||
This interface allows modification of existing access control rules.
|
||||
The format accepted on write is:
|
||||
The format accepted on write is::
|
||||
|
||||
"%s %s %s %s"
|
||||
|
||||
where the first string is the subject label, the second the
|
||||
object label, the third the access to allow and the fourth the
|
||||
access to deny. The access strings may contain only the characters
|
||||
@ -141,47 +163,63 @@ change-rule
|
||||
modified by enabling the permissions in the third string and disabling
|
||||
those in the fourth string. If there is no such rule it will be
|
||||
created using the access specified in the third and the fourth strings.
|
||||
|
||||
cipso
|
||||
Provided for backward compatibility. The cipso2 interface
|
||||
is preferred and should be used instead.
|
||||
This interface allows a specific CIPSO header to be assigned
|
||||
to a Smack label. The format accepted on write is:
|
||||
to a Smack label. The format accepted on write is::
|
||||
|
||||
"%24s%4d%4d"["%4d"]...
|
||||
|
||||
The first string is a fixed Smack label. The first number is
|
||||
the level to use. The second number is the number of categories.
|
||||
The following numbers are the categories.
|
||||
"level-3-cats-5-19 3 2 5 19"
|
||||
The following numbers are the categories::
|
||||
|
||||
"level-3-cats-5-19 3 2 5 19"
|
||||
|
||||
cipso2
|
||||
This interface allows a specific CIPSO header to be assigned
|
||||
to a Smack label. The format accepted on write is:
|
||||
"%s%4d%4d"["%4d"]...
|
||||
to a Smack label. The format accepted on write is::
|
||||
|
||||
"%s%4d%4d"["%4d"]...
|
||||
|
||||
The first string is a long Smack label. The first number is
|
||||
the level to use. The second number is the number of categories.
|
||||
The following numbers are the categories.
|
||||
"level-3-cats-5-19 3 2 5 19"
|
||||
The following numbers are the categories::
|
||||
|
||||
"level-3-cats-5-19 3 2 5 19"
|
||||
|
||||
direct
|
||||
This contains the CIPSO level used for Smack direct label
|
||||
representation in network packets.
|
||||
|
||||
doi
|
||||
This contains the CIPSO domain of interpretation used in
|
||||
network packets.
|
||||
|
||||
ipv6host
|
||||
This interface allows specific IPv6 internet addresses to be
|
||||
treated as single label hosts. Packets are sent to single
|
||||
label hosts only from processes that have Smack write access
|
||||
to the host label. All packets received from single label hosts
|
||||
are given the specified label. The format accepted on write is:
|
||||
are given the specified label. The format accepted on write is::
|
||||
|
||||
"%h:%h:%h:%h:%h:%h:%h:%h label" or
|
||||
"%h:%h:%h:%h:%h:%h:%h:%h/%d label".
|
||||
|
||||
The "::" address shortcut is not supported.
|
||||
If label is "-DELETE" a matched entry will be deleted.
|
||||
|
||||
load
|
||||
Provided for backward compatibility. The load2 interface
|
||||
is preferred and should be used instead.
|
||||
This interface allows access control rules in addition to
|
||||
the system defined rules to be specified. The format accepted
|
||||
on write is:
|
||||
on write is::
|
||||
|
||||
"%24s%24s%5s"
|
||||
|
||||
where the first string is the subject label, the second the
|
||||
object label, and the third the requested access. The access
|
||||
string may contain only the characters "rwxat-", and specifies
|
||||
@ -189,17 +227,21 @@ load
|
||||
permissions that are not allowed. The string "r-x--" would
|
||||
specify read and execute access. Labels are limited to 23
|
||||
characters in length.
|
||||
|
||||
load2
|
||||
This interface allows access control rules in addition to
|
||||
the system defined rules to be specified. The format accepted
|
||||
on write is:
|
||||
on write is::
|
||||
|
||||
"%s %s %s"
|
||||
|
||||
where the first string is the subject label, the second the
|
||||
object label, and the third the requested access. The access
|
||||
string may contain only the characters "rwxat-", and specifies
|
||||
which sort of access is allowed. The "-" is a placeholder for
|
||||
permissions that are not allowed. The string "r-x--" would
|
||||
specify read and execute access.
|
||||
|
||||
load-self
|
||||
Provided for backward compatibility. The load-self2 interface
|
||||
is preferred and should be used instead.
|
||||
@ -208,66 +250,83 @@ load-self
|
||||
otherwise be permitted, and are intended to provide additional
|
||||
restrictions on the process. The format is the same as for
|
||||
the load interface.
|
||||
|
||||
load-self2
|
||||
This interface allows process specific access rules to be
|
||||
defined. These rules are only consulted if access would
|
||||
otherwise be permitted, and are intended to provide additional
|
||||
restrictions on the process. The format is the same as for
|
||||
the load2 interface.
|
||||
|
||||
logging
|
||||
This contains the Smack logging state.
|
||||
|
||||
mapped
|
||||
This contains the CIPSO level used for Smack mapped label
|
||||
representation in network packets.
|
||||
|
||||
netlabel
|
||||
This interface allows specific internet addresses to be
|
||||
treated as single label hosts. Packets are sent to single
|
||||
label hosts without CIPSO headers, but only from processes
|
||||
that have Smack write access to the host label. All packets
|
||||
received from single label hosts are given the specified
|
||||
label. The format accepted on write is:
|
||||
label. The format accepted on write is::
|
||||
|
||||
"%d.%d.%d.%d label" or "%d.%d.%d.%d/%d label".
|
||||
|
||||
If the label specified is "-CIPSO" the address is treated
|
||||
as a host that supports CIPSO headers.
|
||||
|
||||
onlycap
|
||||
This contains labels processes must have for CAP_MAC_ADMIN
|
||||
and CAP_MAC_OVERRIDE to be effective. If this file is empty
|
||||
and ``CAP_MAC_OVERRIDE`` to be effective. If this file is empty
|
||||
these capabilities are effective at for processes with any
|
||||
label. The values are set by writing the desired labels, separated
|
||||
by spaces, to the file or cleared by writing "-" to the file.
|
||||
|
||||
ptrace
|
||||
This is used to define the current ptrace policy
|
||||
0 - default: this is the policy that relies on Smack access rules.
|
||||
For the PTRACE_READ a subject needs to have a read access on
|
||||
object. For the PTRACE_ATTACH a read-write access is required.
|
||||
1 - exact: this is the policy that limits PTRACE_ATTACH. Attach is
|
||||
|
||||
0 - default:
|
||||
this is the policy that relies on Smack access rules.
|
||||
For the ``PTRACE_READ`` a subject needs to have a read access on
|
||||
object. For the ``PTRACE_ATTACH`` a read-write access is required.
|
||||
|
||||
1 - exact:
|
||||
this is the policy that limits ``PTRACE_ATTACH``. Attach is
|
||||
only allowed when subject's and object's labels are equal.
|
||||
PTRACE_READ is not affected. Can be overridden with CAP_SYS_PTRACE.
|
||||
2 - draconian: this policy behaves like the 'exact' above with an
|
||||
exception that it can't be overridden with CAP_SYS_PTRACE.
|
||||
``PTRACE_READ`` is not affected. Can be overridden with ``CAP_SYS_PTRACE``.
|
||||
|
||||
2 - draconian:
|
||||
this policy behaves like the 'exact' above with an
|
||||
exception that it can't be overridden with ``CAP_SYS_PTRACE``.
|
||||
|
||||
revoke-subject
|
||||
Writing a Smack label here sets the access to '-' for all access
|
||||
rules with that subject label.
|
||||
|
||||
unconfined
|
||||
If the kernel is configured with CONFIG_SECURITY_SMACK_BRINGUP
|
||||
a process with CAP_MAC_ADMIN can write a label into this interface.
|
||||
If the kernel is configured with ``CONFIG_SECURITY_SMACK_BRINGUP``
|
||||
a process with ``CAP_MAC_ADMIN`` can write a label into this interface.
|
||||
Thereafter, accesses that involve that label will be logged and
|
||||
the access permitted if it wouldn't be otherwise. Note that this
|
||||
is dangerous and can ruin the proper labeling of your system.
|
||||
It should never be used in production.
|
||||
|
||||
relabel-self
|
||||
This interface contains a list of labels to which the process can
|
||||
transition to, by writing to /proc/self/attr/current.
|
||||
transition to, by writing to ``/proc/self/attr/current``.
|
||||
Normally a process can change its own label to any legal value, but only
|
||||
if it has CAP_MAC_ADMIN. This interface allows a process without
|
||||
CAP_MAC_ADMIN to relabel itself to one of labels from predefined list.
|
||||
A process without CAP_MAC_ADMIN can change its label only once. When it
|
||||
if it has ``CAP_MAC_ADMIN``. This interface allows a process without
|
||||
``CAP_MAC_ADMIN`` to relabel itself to one of labels from predefined list.
|
||||
A process without ``CAP_MAC_ADMIN`` can change its label only once. When it
|
||||
does, this list will be cleared.
|
||||
The values are set by writing the desired labels, separated
|
||||
by spaces, to the file or cleared by writing "-" to the file.
|
||||
|
||||
If you are using the smackload utility
|
||||
you can add access rules in /etc/smack/accesses. They take the form:
|
||||
you can add access rules in ``/etc/smack/accesses``. They take the form::
|
||||
|
||||
subjectlabel objectlabel access
|
||||
|
||||
@ -277,14 +336,14 @@ object with objectlabel. If there is no rule no access is allowed.
|
||||
|
||||
Look for additional programs on http://schaufler-ca.com
|
||||
|
||||
From the Smack Whitepaper:
|
||||
|
||||
The Simplified Mandatory Access Control Kernel
|
||||
The Simplified Mandatory Access Control Kernel (Whitepaper)
|
||||
===========================================================
|
||||
|
||||
Casey Schaufler
|
||||
casey@schaufler-ca.com
|
||||
|
||||
Mandatory Access Control
|
||||
------------------------
|
||||
|
||||
Computer systems employ a variety of schemes to constrain how information is
|
||||
shared among the people and services using the machine. Some of these schemes
|
||||
@ -297,6 +356,7 @@ access control mechanisms because you don't have a choice regarding the users
|
||||
or programs that have access to pieces of data.
|
||||
|
||||
Bell & LaPadula
|
||||
---------------
|
||||
|
||||
From the middle of the 1980's until the turn of the century Mandatory Access
|
||||
Control (MAC) was very closely associated with the Bell & LaPadula security
|
||||
@ -306,6 +366,7 @@ within the Capital Beltway and Scandinavian supercomputer centers but was
|
||||
often sited as failing to address general needs.
|
||||
|
||||
Domain Type Enforcement
|
||||
-----------------------
|
||||
|
||||
Around the turn of the century Domain Type Enforcement (DTE) became popular.
|
||||
This scheme organizes users, programs, and data into domains that are
|
||||
@ -316,6 +377,7 @@ necessary to provide a secure domain mapping leads to the scheme being
|
||||
disabled or used in limited ways in the majority of cases.
|
||||
|
||||
Smack
|
||||
-----
|
||||
|
||||
Smack is a Mandatory Access Control mechanism designed to provide useful MAC
|
||||
while avoiding the pitfalls of its predecessors. The limitations of Bell &
|
||||
@ -326,46 +388,55 @@ Enforcement and avoided by defining access controls in terms of the access
|
||||
modes already in use.
|
||||
|
||||
Smack Terminology
|
||||
-----------------
|
||||
|
||||
The jargon used to talk about Smack will be familiar to those who have dealt
|
||||
with other MAC systems and shouldn't be too difficult for the uninitiated to
|
||||
pick up. There are four terms that are used in a specific way and that are
|
||||
especially important:
|
||||
|
||||
Subject: A subject is an active entity on the computer system.
|
||||
Subject:
|
||||
A subject is an active entity on the computer system.
|
||||
On Smack a subject is a task, which is in turn the basic unit
|
||||
of execution.
|
||||
|
||||
Object: An object is a passive entity on the computer system.
|
||||
Object:
|
||||
An object is a passive entity on the computer system.
|
||||
On Smack files of all types, IPC, and tasks can be objects.
|
||||
|
||||
Access: Any attempt by a subject to put information into or get
|
||||
Access:
|
||||
Any attempt by a subject to put information into or get
|
||||
information from an object is an access.
|
||||
|
||||
Label: Data that identifies the Mandatory Access Control
|
||||
Label:
|
||||
Data that identifies the Mandatory Access Control
|
||||
characteristics of a subject or an object.
|
||||
|
||||
These definitions are consistent with the traditional use in the security
|
||||
community. There are also some terms from Linux that are likely to crop up:
|
||||
|
||||
Capability: A task that possesses a capability has permission to
|
||||
Capability:
|
||||
A task that possesses a capability has permission to
|
||||
violate an aspect of the system security policy, as identified by
|
||||
the specific capability. A task that possesses one or more
|
||||
capabilities is a privileged task, whereas a task with no
|
||||
capabilities is an unprivileged task.
|
||||
|
||||
Privilege: A task that is allowed to violate the system security
|
||||
Privilege:
|
||||
A task that is allowed to violate the system security
|
||||
policy is said to have privilege. As of this writing a task can
|
||||
have privilege either by possessing capabilities or by having an
|
||||
effective user of root.
|
||||
|
||||
Smack Basics
|
||||
------------
|
||||
|
||||
Smack is an extension to a Linux system. It enforces additional restrictions
|
||||
on what subjects can access which objects, based on the labels attached to
|
||||
each of the subject and the object.
|
||||
|
||||
Labels
|
||||
~~~~~~
|
||||
|
||||
Smack labels are ASCII character strings. They can be up to 255 characters
|
||||
long, but keeping them to twenty-three characters is recommended.
|
||||
@ -377,7 +448,7 @@ contain unprintable characters, the "/" (slash), the "\" (backslash), the "'"
|
||||
(quote) and '"' (double-quote) characters.
|
||||
Smack labels cannot begin with a '-'. This is reserved for special options.
|
||||
|
||||
There are some predefined labels:
|
||||
There are some predefined labels::
|
||||
|
||||
_ Pronounced "floor", a single underscore character.
|
||||
^ Pronounced "hat", a single circumflex character.
|
||||
@ -390,14 +461,18 @@ of a process will usually be assigned by the system initialization
|
||||
mechanism.
|
||||
|
||||
Access Rules
|
||||
~~~~~~~~~~~~
|
||||
|
||||
Smack uses the traditional access modes of Linux. These modes are read,
|
||||
execute, write, and occasionally append. There are a few cases where the
|
||||
access mode may not be obvious. These include:
|
||||
|
||||
Signals: A signal is a write operation from the subject task to
|
||||
Signals:
|
||||
A signal is a write operation from the subject task to
|
||||
the object task.
|
||||
Internet Domain IPC: Transmission of a packet is considered a
|
||||
|
||||
Internet Domain IPC:
|
||||
Transmission of a packet is considered a
|
||||
write operation from the source task to the destination task.
|
||||
|
||||
Smack restricts access based on the label attached to a subject and the label
|
||||
@ -417,6 +492,7 @@ order:
|
||||
7. Any other access is denied.
|
||||
|
||||
Smack Access Rules
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
With the isolation provided by Smack access separation is simple. There are
|
||||
many interesting cases where limited access by subjects to objects with
|
||||
@ -427,8 +503,9 @@ be "born" highly classified. To accommodate such schemes Smack includes a
|
||||
mechanism for specifying rules allowing access between labels.
|
||||
|
||||
Access Rule Format
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The format of an access rule is:
|
||||
The format of an access rule is::
|
||||
|
||||
subject-label object-label access
|
||||
|
||||
@ -446,7 +523,7 @@ describe access modes:
|
||||
|
||||
Uppercase values for the specification letters are allowed as well.
|
||||
Access mode specifications can be in any order. Examples of acceptable rules
|
||||
are:
|
||||
are::
|
||||
|
||||
TopSecret Secret rx
|
||||
Secret Unclass R
|
||||
@ -456,7 +533,7 @@ are:
|
||||
New Old rRrRr
|
||||
Closed Off -
|
||||
|
||||
Examples of unacceptable rules are:
|
||||
Examples of unacceptable rules are::
|
||||
|
||||
Top Secret Secret rx
|
||||
Ace Ace r
|
||||
@ -469,6 +546,7 @@ access specifications. The dash is a placeholder, so "a-r" is the same
|
||||
as "ar". A lone dash is used to specify that no access should be allowed.
|
||||
|
||||
Applying Access Rules
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The developers of Linux rarely define new sorts of things, usually importing
|
||||
schemes and concepts from other systems. Most often, the other systems are
|
||||
@ -511,6 +589,7 @@ one process to another requires that the sender have write access to the
|
||||
receiver. The receiver is not required to have read access to the sender.
|
||||
|
||||
Setting Access Rules
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The configuration file /etc/smack/accesses contains the rules to be set at
|
||||
system startup. The contents are written to the special file
|
||||
@ -520,6 +599,7 @@ one rule, with the most recently specified overriding any earlier
|
||||
specification.
|
||||
|
||||
Task Attribute
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
The Smack label of a process can be read from /proc/<pid>/attr/current. A
|
||||
process can read its own Smack label from /proc/self/attr/current. A
|
||||
@ -527,12 +607,14 @@ privileged process can change its own Smack label by writing to
|
||||
/proc/self/attr/current but not the label of another process.
|
||||
|
||||
File Attribute
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
The Smack label of a filesystem object is stored as an extended attribute
|
||||
named SMACK64 on the file. This attribute is in the security namespace. It can
|
||||
only be changed by a process with privilege.
|
||||
|
||||
Privilege
|
||||
~~~~~~~~~
|
||||
|
||||
A process with CAP_MAC_OVERRIDE or CAP_MAC_ADMIN is privileged.
|
||||
CAP_MAC_OVERRIDE allows the process access to objects it would
|
||||
@ -540,6 +622,7 @@ be denied otherwise. CAP_MAC_ADMIN allows a process to change
|
||||
Smack data, including rules and attributes.
|
||||
|
||||
Smack Networking
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
As mentioned before, Smack enforces access control on network protocol
|
||||
transmissions. Every packet sent by a Smack process is tagged with its Smack
|
||||
@ -551,6 +634,7 @@ packet has write access to the receiving process and if that is not the case
|
||||
the packet is dropped.
|
||||
|
||||
CIPSO Configuration
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
It is normally unnecessary to specify the CIPSO configuration. The default
|
||||
values used by the system handle all internal cases. Smack will compose CIPSO
|
||||
@ -571,13 +655,13 @@ discarded. The DOI is 3 by default. The value can be read from
|
||||
The label and category set are mapped to a Smack label as defined in
|
||||
/etc/smack/cipso.
|
||||
|
||||
A Smack/CIPSO mapping has the form:
|
||||
A Smack/CIPSO mapping has the form::
|
||||
|
||||
smack level [category [category]*]
|
||||
|
||||
Smack does not expect the level or category sets to be related in any
|
||||
particular way and does not assume or assign accesses based on them. Some
|
||||
examples of mappings:
|
||||
examples of mappings::
|
||||
|
||||
TopSecret 7
|
||||
TS:A,B 7 1 2
|
||||
@ -597,25 +681,30 @@ value can be read from /sys/fs/smackfs/direct and changed by writing to
|
||||
/sys/fs/smackfs/direct.
|
||||
|
||||
Socket Attributes
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
There are two attributes that are associated with sockets. These attributes
|
||||
can only be set by privileged tasks, but any task can read them for their own
|
||||
sockets.
|
||||
|
||||
SMACK64IPIN: The Smack label of the task object. A privileged
|
||||
SMACK64IPIN:
|
||||
The Smack label of the task object. A privileged
|
||||
program that will enforce policy may set this to the star label.
|
||||
|
||||
SMACK64IPOUT: The Smack label transmitted with outgoing packets.
|
||||
SMACK64IPOUT:
|
||||
The Smack label transmitted with outgoing packets.
|
||||
A privileged program may set this to match the label of another
|
||||
task with which it hopes to communicate.
|
||||
|
||||
Smack Netlabel Exceptions
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You will often find that your labeled application has to talk to the outside,
|
||||
unlabeled world. To do this there's a special file /sys/fs/smackfs/netlabel
|
||||
where you can add some exceptions in the form of :
|
||||
@IP1 LABEL1 or
|
||||
@IP2/MASK LABEL2
|
||||
where you can add some exceptions in the form of::
|
||||
|
||||
@IP1 LABEL1 or
|
||||
@IP2/MASK LABEL2
|
||||
|
||||
It means that your application will have unlabeled access to @IP1 if it has
|
||||
write access on LABEL1, and access to the subnet @IP2/MASK if it has write
|
||||
@ -624,28 +713,32 @@ access on LABEL2.
|
||||
Entries in the /sys/fs/smackfs/netlabel file are matched by longest mask
|
||||
first, like in classless IPv4 routing.
|
||||
|
||||
A special label '@' and an option '-CIPSO' can be used there :
|
||||
@ means Internet, any application with any label has access to it
|
||||
-CIPSO means standard CIPSO networking
|
||||
A special label '@' and an option '-CIPSO' can be used there::
|
||||
|
||||
If you don't know what CIPSO is and don't plan to use it, you can just do :
|
||||
echo 127.0.0.1 -CIPSO > /sys/fs/smackfs/netlabel
|
||||
echo 0.0.0.0/0 @ > /sys/fs/smackfs/netlabel
|
||||
@ means Internet, any application with any label has access to it
|
||||
-CIPSO means standard CIPSO networking
|
||||
|
||||
If you don't know what CIPSO is and don't plan to use it, you can just do::
|
||||
|
||||
echo 127.0.0.1 -CIPSO > /sys/fs/smackfs/netlabel
|
||||
echo 0.0.0.0/0 @ > /sys/fs/smackfs/netlabel
|
||||
|
||||
If you use CIPSO on your 192.168.0.0/16 local network and need also unlabeled
|
||||
Internet access, you can have :
|
||||
echo 127.0.0.1 -CIPSO > /sys/fs/smackfs/netlabel
|
||||
echo 192.168.0.0/16 -CIPSO > /sys/fs/smackfs/netlabel
|
||||
echo 0.0.0.0/0 @ > /sys/fs/smackfs/netlabel
|
||||
Internet access, you can have::
|
||||
|
||||
echo 127.0.0.1 -CIPSO > /sys/fs/smackfs/netlabel
|
||||
echo 192.168.0.0/16 -CIPSO > /sys/fs/smackfs/netlabel
|
||||
echo 0.0.0.0/0 @ > /sys/fs/smackfs/netlabel
|
||||
|
||||
Writing Applications for Smack
|
||||
------------------------------
|
||||
|
||||
There are three sorts of applications that will run on a Smack system. How an
|
||||
application interacts with Smack will determine what it will have to do to
|
||||
work properly under Smack.
|
||||
|
||||
Smack Ignorant Applications
|
||||
---------------------------
|
||||
|
||||
By far the majority of applications have no reason whatever to care about the
|
||||
unique properties of Smack. Since invoking a program has no impact on the
|
||||
@ -653,12 +746,14 @@ Smack label associated with the process the only concern likely to arise is
|
||||
whether the process has execute access to the program.
|
||||
|
||||
Smack Relevant Applications
|
||||
---------------------------
|
||||
|
||||
Some programs can be improved by teaching them about Smack, but do not make
|
||||
any security decisions themselves. The utility ls(1) is one example of such a
|
||||
program.
|
||||
|
||||
Smack Enforcing Applications
|
||||
----------------------------
|
||||
|
||||
These are special programs that not only know about Smack, but participate in
|
||||
the enforcement of system policy. In most cases these are the programs that
|
||||
@ -666,15 +761,16 @@ set up user sessions. There are also network services that provide information
|
||||
to processes running with various labels.
|
||||
|
||||
File System Interfaces
|
||||
----------------------
|
||||
|
||||
Smack maintains labels on file system objects using extended attributes. The
|
||||
Smack label of a file, directory, or other file system object can be obtained
|
||||
using getxattr(2).
|
||||
using getxattr(2)::
|
||||
|
||||
len = getxattr("/", "security.SMACK64", value, sizeof (value));
|
||||
|
||||
will put the Smack label of the root directory into value. A privileged
|
||||
process can set the Smack label of a file system object with setxattr(2).
|
||||
process can set the Smack label of a file system object with setxattr(2)::
|
||||
|
||||
len = strlen("Rubble");
|
||||
rc = setxattr("/foo", "security.SMACK64", "Rubble", len, 0);
|
||||
@ -683,17 +779,18 @@ will set the Smack label of /foo to "Rubble" if the program has appropriate
|
||||
privilege.
|
||||
|
||||
Socket Interfaces
|
||||
-----------------
|
||||
|
||||
The socket attributes can be read using fgetxattr(2).
|
||||
|
||||
A privileged process can set the Smack label of outgoing packets with
|
||||
fsetxattr(2).
|
||||
fsetxattr(2)::
|
||||
|
||||
len = strlen("Rubble");
|
||||
rc = fsetxattr(fd, "security.SMACK64IPOUT", "Rubble", len, 0);
|
||||
|
||||
will set the Smack label "Rubble" on packets going out from the socket if the
|
||||
program has appropriate privilege.
|
||||
program has appropriate privilege::
|
||||
|
||||
rc = fsetxattr(fd, "security.SMACK64IPIN, "*", strlen("*"), 0);
|
||||
|
||||
@ -701,33 +798,40 @@ will set the Smack label "*" as the object label against which incoming
|
||||
packets will be checked if the program has appropriate privilege.
|
||||
|
||||
Administration
|
||||
--------------
|
||||
|
||||
Smack supports some mount options:
|
||||
|
||||
smackfsdef=label: specifies the label to give files that lack
|
||||
smackfsdef=label:
|
||||
specifies the label to give files that lack
|
||||
the Smack label extended attribute.
|
||||
|
||||
smackfsroot=label: specifies the label to assign the root of the
|
||||
smackfsroot=label:
|
||||
specifies the label to assign the root of the
|
||||
file system if it lacks the Smack extended attribute.
|
||||
|
||||
smackfshat=label: specifies a label that must have read access to
|
||||
smackfshat=label:
|
||||
specifies a label that must have read access to
|
||||
all labels set on the filesystem. Not yet enforced.
|
||||
|
||||
smackfsfloor=label: specifies a label to which all labels set on the
|
||||
smackfsfloor=label:
|
||||
specifies a label to which all labels set on the
|
||||
filesystem must have read access. Not yet enforced.
|
||||
|
||||
These mount options apply to all file system types.
|
||||
|
||||
Smack auditing
|
||||
--------------
|
||||
|
||||
If you want Smack auditing of security events, you need to set CONFIG_AUDIT
|
||||
in your kernel configuration.
|
||||
By default, all denied events will be audited. You can change this behavior by
|
||||
writing a single character to the /sys/fs/smackfs/logging file :
|
||||
0 : no logging
|
||||
1 : log denied (default)
|
||||
2 : log accepted
|
||||
3 : log denied & accepted
|
||||
writing a single character to the /sys/fs/smackfs/logging file::
|
||||
|
||||
0 : no logging
|
||||
1 : log denied (default)
|
||||
2 : log accepted
|
||||
3 : log denied & accepted
|
||||
|
||||
Events are logged as 'key=value' pairs, for each event you at least will get
|
||||
the subject, the object, the rights requested, the action, the kernel function
|
||||
@ -735,6 +839,7 @@ that triggered the event, plus other pairs depending on the type of event
|
||||
audited.
|
||||
|
||||
Bringup Mode
|
||||
------------
|
||||
|
||||
Bringup mode provides logging features that can make application
|
||||
configuration and system bringup easier. Configure the kernel with
|
@ -1,13 +1,14 @@
|
||||
====
|
||||
Yama
|
||||
====
|
||||
|
||||
Yama is a Linux Security Module that collects system-wide DAC security
|
||||
protections that are not handled by the core kernel itself. This is
|
||||
selectable at build-time with CONFIG_SECURITY_YAMA, and can be controlled
|
||||
at run-time through sysctls in /proc/sys/kernel/yama:
|
||||
selectable at build-time with ``CONFIG_SECURITY_YAMA``, and can be controlled
|
||||
at run-time through sysctls in ``/proc/sys/kernel/yama``:
|
||||
|
||||
- ptrace_scope
|
||||
|
||||
==============================================================
|
||||
|
||||
ptrace_scope:
|
||||
ptrace_scope
|
||||
============
|
||||
|
||||
As Linux grows in popularity, it will become a larger target for
|
||||
malware. One particularly troubling weakness of the Linux process
|
||||
@ -25,47 +26,49 @@ exist and remain possible if ptrace is allowed to operate as before.
|
||||
Since ptrace is not commonly used by non-developers and non-admins, system
|
||||
builders should be allowed the option to disable this debugging system.
|
||||
|
||||
For a solution, some applications use prctl(PR_SET_DUMPABLE, ...) to
|
||||
For a solution, some applications use ``prctl(PR_SET_DUMPABLE, ...)`` to
|
||||
specifically disallow such ptrace attachment (e.g. ssh-agent), but many
|
||||
do not. A more general solution is to only allow ptrace directly from a
|
||||
parent to a child process (i.e. direct "gdb EXE" and "strace EXE" still
|
||||
work), or with CAP_SYS_PTRACE (i.e. "gdb --pid=PID", and "strace -p PID"
|
||||
work), or with ``CAP_SYS_PTRACE`` (i.e. "gdb --pid=PID", and "strace -p PID"
|
||||
still work as root).
|
||||
|
||||
In mode 1, software that has defined application-specific relationships
|
||||
between a debugging process and its inferior (crash handlers, etc),
|
||||
prctl(PR_SET_PTRACER, pid, ...) can be used. An inferior can declare which
|
||||
other process (and its descendants) are allowed to call PTRACE_ATTACH
|
||||
``prctl(PR_SET_PTRACER, pid, ...)`` can be used. An inferior can declare which
|
||||
other process (and its descendants) are allowed to call ``PTRACE_ATTACH``
|
||||
against it. Only one such declared debugging process can exists for
|
||||
each inferior at a time. For example, this is used by KDE, Chromium, and
|
||||
Firefox's crash handlers, and by Wine for allowing only Wine processes
|
||||
to ptrace each other. If a process wishes to entirely disable these ptrace
|
||||
restrictions, it can call prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, ...)
|
||||
restrictions, it can call ``prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, ...)``
|
||||
so that any otherwise allowed process (even those in external pid namespaces)
|
||||
may attach.
|
||||
|
||||
The sysctl settings (writable only with CAP_SYS_PTRACE) are:
|
||||
The sysctl settings (writable only with ``CAP_SYS_PTRACE``) are:
|
||||
|
||||
0 - classic ptrace permissions: a process can PTRACE_ATTACH to any other
|
||||
0 - classic ptrace permissions:
|
||||
a process can ``PTRACE_ATTACH`` to any other
|
||||
process running under the same uid, as long as it is dumpable (i.e.
|
||||
did not transition uids, start privileged, or have called
|
||||
prctl(PR_SET_DUMPABLE...) already). Similarly, PTRACE_TRACEME is
|
||||
``prctl(PR_SET_DUMPABLE...)`` already). Similarly, ``PTRACE_TRACEME`` is
|
||||
unchanged.
|
||||
|
||||
1 - restricted ptrace: a process must have a predefined relationship
|
||||
with the inferior it wants to call PTRACE_ATTACH on. By default,
|
||||
1 - restricted ptrace:
|
||||
a process must have a predefined relationship
|
||||
with the inferior it wants to call ``PTRACE_ATTACH`` on. By default,
|
||||
this relationship is that of only its descendants when the above
|
||||
classic criteria is also met. To change the relationship, an
|
||||
inferior can call prctl(PR_SET_PTRACER, debugger, ...) to declare
|
||||
an allowed debugger PID to call PTRACE_ATTACH on the inferior.
|
||||
Using PTRACE_TRACEME is unchanged.
|
||||
inferior can call ``prctl(PR_SET_PTRACER, debugger, ...)`` to declare
|
||||
an allowed debugger PID to call ``PTRACE_ATTACH`` on the inferior.
|
||||
Using ``PTRACE_TRACEME`` is unchanged.
|
||||
|
||||
2 - admin-only attach: only processes with CAP_SYS_PTRACE may use ptrace
|
||||
with PTRACE_ATTACH, or through children calling PTRACE_TRACEME.
|
||||
2 - admin-only attach:
|
||||
only processes with ``CAP_SYS_PTRACE`` may use ptrace
|
||||
with ``PTRACE_ATTACH``, or through children calling ``PTRACE_TRACEME``.
|
||||
|
||||
3 - no attach: no processes may use ptrace with PTRACE_ATTACH nor via
|
||||
PTRACE_TRACEME. Once set, this sysctl value cannot be changed.
|
||||
3 - no attach:
|
||||
no processes may use ptrace with ``PTRACE_ATTACH`` nor via
|
||||
``PTRACE_TRACEME``. Once set, this sysctl value cannot be changed.
|
||||
|
||||
The original children-only logic was based on the restrictions in grsecurity.
|
||||
|
||||
==============================================================
|
@ -1,4 +1,9 @@
|
||||
--- What is AppArmor? ---
|
||||
========
|
||||
AppArmor
|
||||
========
|
||||
|
||||
What is AppArmor?
|
||||
=================
|
||||
|
||||
AppArmor is MAC style security extension for the Linux kernel. It implements
|
||||
a task centered policy, with task "profiles" being created and loaded
|
||||
@ -6,34 +11,41 @@ from user space. Tasks on the system that do not have a profile defined for
|
||||
them run in an unconfined state which is equivalent to standard Linux DAC
|
||||
permissions.
|
||||
|
||||
--- How to enable/disable ---
|
||||
How to enable/disable
|
||||
=====================
|
||||
|
||||
set CONFIG_SECURITY_APPARMOR=y
|
||||
set ``CONFIG_SECURITY_APPARMOR=y``
|
||||
|
||||
If AppArmor should be selected as the default security module then
|
||||
set CONFIG_DEFAULT_SECURITY="apparmor"
|
||||
and CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1
|
||||
If AppArmor should be selected as the default security module then set::
|
||||
|
||||
CONFIG_DEFAULT_SECURITY="apparmor"
|
||||
CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1
|
||||
|
||||
Build the kernel
|
||||
|
||||
If AppArmor is not the default security module it can be enabled by passing
|
||||
security=apparmor on the kernel's command line.
|
||||
``security=apparmor`` on the kernel's command line.
|
||||
|
||||
If AppArmor is the default security module it can be disabled by passing
|
||||
apparmor=0, security=XXXX (where XXX is valid security module), on the
|
||||
kernel's command line
|
||||
``apparmor=0, security=XXXX`` (where ``XXXX`` is valid security module), on the
|
||||
kernel's command line.
|
||||
|
||||
For AppArmor to enforce any restrictions beyond standard Linux DAC permissions
|
||||
policy must be loaded into the kernel from user space (see the Documentation
|
||||
and tools links).
|
||||
|
||||
--- Documentation ---
|
||||
Documentation
|
||||
=============
|
||||
|
||||
Documentation can be found on the wiki.
|
||||
Documentation can be found on the wiki, linked below.
|
||||
|
||||
--- Links ---
|
||||
Links
|
||||
=====
|
||||
|
||||
Mailing List - apparmor@lists.ubuntu.com
|
||||
|
||||
Wiki - http://apparmor.wiki.kernel.org/
|
||||
|
||||
User space tools - https://launchpad.net/apparmor
|
||||
|
||||
Kernel module - git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git
|
@ -1,12 +1,13 @@
|
||||
Linux Security Module framework
|
||||
-------------------------------
|
||||
===========================
|
||||
Linux Security Module Usage
|
||||
===========================
|
||||
|
||||
The Linux Security Module (LSM) framework provides a mechanism for
|
||||
various security checks to be hooked by new kernel extensions. The name
|
||||
"module" is a bit of a misnomer since these extensions are not actually
|
||||
loadable kernel modules. Instead, they are selectable at build-time via
|
||||
CONFIG_DEFAULT_SECURITY and can be overridden at boot-time via the
|
||||
"security=..." kernel command line argument, in the case where multiple
|
||||
``"security=..."`` kernel command line argument, in the case where multiple
|
||||
LSMs were built into a given kernel.
|
||||
|
||||
The primary users of the LSM interface are Mandatory Access Control
|
||||
@ -19,23 +20,22 @@ in the core functionality of Linux itself.
|
||||
Without a specific LSM built into the kernel, the default LSM will be the
|
||||
Linux capabilities system. Most LSMs choose to extend the capabilities
|
||||
system, building their checks on top of the defined capability hooks.
|
||||
For more details on capabilities, see capabilities(7) in the Linux
|
||||
For more details on capabilities, see ``capabilities(7)`` in the Linux
|
||||
man-pages project.
|
||||
|
||||
A list of the active security modules can be found by reading
|
||||
/sys/kernel/security/lsm. This is a comma separated list, and
|
||||
``/sys/kernel/security/lsm``. This is a comma separated list, and
|
||||
will always include the capability module. The list reflects the
|
||||
order in which checks are made. The capability module will always
|
||||
be first, followed by any "minor" modules (e.g. Yama) and then
|
||||
the one "major" module (e.g. SELinux) if there is one configured.
|
||||
|
||||
Based on https://lkml.org/lkml/2007/10/26/215,
|
||||
a new LSM is accepted into the kernel when its intent (a description of
|
||||
what it tries to protect against and in what cases one would expect to
|
||||
use it) has been appropriately documented in Documentation/security/.
|
||||
This allows an LSM's code to be easily compared to its goals, and so
|
||||
that end users and distros can make a more informed decision about which
|
||||
LSMs suit their requirements.
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
For extensive documentation on the available LSM hook interfaces, please
|
||||
see include/linux/security.h.
|
||||
apparmor
|
||||
LoadPin
|
||||
SELinux
|
||||
Smack
|
||||
tomoyo
|
||||
Yama
|
@ -1,21 +1,30 @@
|
||||
--- What is TOMOYO? ---
|
||||
======
|
||||
TOMOYO
|
||||
======
|
||||
|
||||
What is TOMOYO?
|
||||
===============
|
||||
|
||||
TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel.
|
||||
|
||||
LiveCD-based tutorials are available at
|
||||
|
||||
http://tomoyo.sourceforge.jp/1.7/1st-step/ubuntu10.04-live/
|
||||
http://tomoyo.sourceforge.jp/1.7/1st-step/centos5-live/ .
|
||||
http://tomoyo.sourceforge.jp/1.7/1st-step/centos5-live/
|
||||
|
||||
Though these tutorials use non-LSM version of TOMOYO, they are useful for you
|
||||
to know what TOMOYO is.
|
||||
|
||||
--- How to enable TOMOYO? ---
|
||||
How to enable TOMOYO?
|
||||
=====================
|
||||
|
||||
Build the kernel with CONFIG_SECURITY_TOMOYO=y and pass "security=tomoyo" on
|
||||
Build the kernel with ``CONFIG_SECURITY_TOMOYO=y`` and pass ``security=tomoyo`` on
|
||||
kernel's command line.
|
||||
|
||||
Please see http://tomoyo.sourceforge.jp/2.3/ for details.
|
||||
|
||||
--- Where is documentation? ---
|
||||
Where is documentation?
|
||||
=======================
|
||||
|
||||
User <-> Kernel interface documentation is available at
|
||||
http://tomoyo.sourceforge.jp/2.3/policy-reference.html .
|
||||
@ -42,7 +51,8 @@ History of TOMOYO?
|
||||
Realities of Mainlining
|
||||
http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf
|
||||
|
||||
--- What is future plan? ---
|
||||
What is future plan?
|
||||
====================
|
||||
|
||||
We believe that inode based security and name based security are complementary
|
||||
and both should be used together. But unfortunately, so far, we cannot enable
|
@ -55,12 +55,6 @@ Documentation
|
||||
contains information about the problems, which may result by upgrading
|
||||
your kernel.
|
||||
|
||||
- The Documentation/DocBook/ subdirectory contains several guides for
|
||||
kernel developers and users. These guides can be rendered in a
|
||||
number of formats: PostScript (.ps), PDF, HTML, & man-pages, among others.
|
||||
After installation, ``make psdocs``, ``make pdfdocs``, ``make htmldocs``,
|
||||
or ``make mandocs`` will render the documentation in the requested format.
|
||||
|
||||
Installing the kernel source
|
||||
----------------------------
|
||||
|
||||
|
@ -369,8 +369,10 @@
|
||||
237 = /dev/loop-control Loopback control device
|
||||
238 = /dev/vhost-net Host kernel accelerator for virtio net
|
||||
239 = /dev/uhid User-space I/O driver support for HID subsystem
|
||||
240 = /dev/userio Serio driver testing device
|
||||
241 = /dev/vhost-vsock Host kernel driver for virtio vsock
|
||||
|
||||
240-254 Reserved for local use
|
||||
242-254 Reserved for local use
|
||||
255 Reserved for MISC_DYNAMIC_MINOR
|
||||
|
||||
11 char Raw keyboard device (Linux/SPARC only)
|
||||
|
@ -61,6 +61,8 @@ configure specific aspects of kernel behavior to your liking.
|
||||
java
|
||||
ras
|
||||
pm/index
|
||||
thunderbolt
|
||||
LSM/index
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
|
@ -649,6 +649,13 @@
|
||||
/proc/<pid>/coredump_filter.
|
||||
See also Documentation/filesystems/proc.txt.
|
||||
|
||||
coresight_cpu_debug.enable
|
||||
[ARM,ARM64]
|
||||
Format: <bool>
|
||||
Enable/disable the CPU sampling based debugging.
|
||||
0: default value, disable debugging
|
||||
1: enable debugging at boot time
|
||||
|
||||
cpuidle.off=1 [CPU_IDLE]
|
||||
disable the cpuidle sub-system
|
||||
|
||||
@ -720,7 +727,8 @@
|
||||
See also Documentation/input/joystick-parport.txt
|
||||
|
||||
ddebug_query= [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
|
||||
time. See Documentation/dynamic-debug-howto.txt for
|
||||
time. See
|
||||
Documentation/admin-guide/dynamic-debug-howto.rst for
|
||||
details. Deprecated, see dyndbg.
|
||||
|
||||
debug [KNL] Enable kernel debugging (events log level).
|
||||
@ -866,6 +874,15 @@
|
||||
|
||||
dscc4.setup= [NET]
|
||||
|
||||
dt_cpu_ftrs= [PPC]
|
||||
Format: {"off" | "known"}
|
||||
Control how the dt_cpu_ftrs device-tree binding is
|
||||
used for CPU feature discovery and setup (if it
|
||||
exists).
|
||||
off: Do not use it, fall back to legacy cpu table.
|
||||
known: Do not pass through unknown features to guests
|
||||
or userspace, only those that the kernel is aware of.
|
||||
|
||||
dump_apple_properties [X86]
|
||||
Dump name and content of EFI device properties on
|
||||
x86 Macs. Useful for driver authors to determine
|
||||
@ -874,7 +891,8 @@
|
||||
dyndbg[="val"] [KNL,DYNAMIC_DEBUG]
|
||||
module.dyndbg[="val"]
|
||||
Enable debug messages at boot time. See
|
||||
Documentation/dynamic-debug-howto.txt for details.
|
||||
Documentation/admin-guide/dynamic-debug-howto.rst
|
||||
for details.
|
||||
|
||||
nompx [X86] Disables Intel Memory Protection Extensions.
|
||||
See Documentation/x86/intel_mpx.txt for more
|
||||
@ -945,6 +963,12 @@
|
||||
must already be setup and configured. Options are not
|
||||
yet supported.
|
||||
|
||||
owl,<addr>
|
||||
Start an early, polled-mode console on a serial port
|
||||
of an Actions Semi SoC, such as S500 or S900, at the
|
||||
specified address. The serial port must already be
|
||||
setup and configured. Options are not yet supported.
|
||||
|
||||
smh Use ARM semihosting calls for early console.
|
||||
|
||||
s3c2410,<addr>
|
||||
@ -1838,6 +1862,18 @@
|
||||
for all guests.
|
||||
Default is 1 (enabled) if in 64-bit or 32-bit PAE mode.
|
||||
|
||||
kvm-arm.vgic_v3_group0_trap=
|
||||
[KVM,ARM] Trap guest accesses to GICv3 group-0
|
||||
system registers
|
||||
|
||||
kvm-arm.vgic_v3_group1_trap=
|
||||
[KVM,ARM] Trap guest accesses to GICv3 group-1
|
||||
system registers
|
||||
|
||||
kvm-arm.vgic_v3_common_trap=
|
||||
[KVM,ARM] Trap guest accesses to GICv3 common
|
||||
system registers
|
||||
|
||||
kvm-intel.ept= [KVM,Intel] Disable extended page tables
|
||||
(virtualized MMU) support on capable Intel chips.
|
||||
Default is 1 (enabled)
|
||||
@ -2136,6 +2172,12 @@
|
||||
memmap=nn[KMG]@ss[KMG]
|
||||
[KNL] Force usage of a specific region of memory.
|
||||
Region of memory to be used is from ss to ss+nn.
|
||||
If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG],
|
||||
which limits max address to nn[KMG].
|
||||
Multiple different regions can be specified,
|
||||
comma delimited.
|
||||
Example:
|
||||
memmap=100M@2G,100M#3G,1G!1024G
|
||||
|
||||
memmap=nn[KMG]#ss[KMG]
|
||||
[KNL,ACPI] Mark specific memory as ACPI data.
|
||||
@ -2148,6 +2190,9 @@
|
||||
memmap=64K$0x18690000
|
||||
or
|
||||
memmap=0x10000$0x18690000
|
||||
Some bootloaders may need an escape character before '$',
|
||||
like Grub2, otherwise '$' and the following number
|
||||
will be eaten.
|
||||
|
||||
memmap=nn[KMG]!ss[KMG]
|
||||
[KNL,X86] Mark specific memory as protected.
|
||||
@ -2270,8 +2315,11 @@
|
||||
that the amount of memory usable for all allocations
|
||||
is not too small.
|
||||
|
||||
movable_node [KNL] Boot-time switch to enable the effects
|
||||
of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
|
||||
movable_node [KNL] Boot-time switch to make hotplugable memory
|
||||
NUMA nodes to be movable. This means that the memory
|
||||
of such nodes will be usable only for movable
|
||||
allocations which rules out almost all kernel
|
||||
allocations. Use with caution!
|
||||
|
||||
MTD_Partition= [MTD]
|
||||
Format: <name>,<region-number>,<size>,<offset>
|
||||
@ -3238,21 +3286,17 @@
|
||||
|
||||
rcutree.gp_cleanup_delay= [KNL]
|
||||
Set the number of jiffies to delay each step of
|
||||
RCU grace-period cleanup. This only has effect
|
||||
when CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP is set.
|
||||
RCU grace-period cleanup.
|
||||
|
||||
rcutree.gp_init_delay= [KNL]
|
||||
Set the number of jiffies to delay each step of
|
||||
RCU grace-period initialization. This only has
|
||||
effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT
|
||||
is set.
|
||||
RCU grace-period initialization.
|
||||
|
||||
rcutree.gp_preinit_delay= [KNL]
|
||||
Set the number of jiffies to delay each step of
|
||||
RCU grace-period pre-initialization, that is,
|
||||
the propagation of recent CPU-hotplug changes up
|
||||
the rcu_node combining tree. This only has effect
|
||||
when CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT is set.
|
||||
the rcu_node combining tree.
|
||||
|
||||
rcutree.rcu_fanout_exact= [KNL]
|
||||
Disable autobalancing of the rcu_node combining
|
||||
@ -3328,6 +3372,17 @@
|
||||
This wake_up() will be accompanied by a
|
||||
WARN_ONCE() splat and an ftrace_dump().
|
||||
|
||||
rcuperf.gp_async= [KNL]
|
||||
Measure performance of asynchronous
|
||||
grace-period primitives such as call_rcu().
|
||||
|
||||
rcuperf.gp_async_max= [KNL]
|
||||
Specify the maximum number of outstanding
|
||||
callbacks per writer thread. When a writer
|
||||
thread exceeds this limit, it invokes the
|
||||
corresponding flavor of rcu_barrier() to allow
|
||||
previously posted callbacks to drain.
|
||||
|
||||
rcuperf.gp_exp= [KNL]
|
||||
Measure performance of expedited synchronous
|
||||
grace-period primitives.
|
||||
@ -3355,17 +3410,22 @@
|
||||
rcuperf.perf_runnable= [BOOT]
|
||||
Start rcuperf running at boot time.
|
||||
|
||||
rcuperf.perf_type= [KNL]
|
||||
Specify the RCU implementation to test.
|
||||
|
||||
rcuperf.shutdown= [KNL]
|
||||
Shut the system down after performance tests
|
||||
complete. This is useful for hands-off automated
|
||||
testing.
|
||||
|
||||
rcuperf.perf_type= [KNL]
|
||||
Specify the RCU implementation to test.
|
||||
|
||||
rcuperf.verbose= [KNL]
|
||||
Enable additional printk() statements.
|
||||
|
||||
rcuperf.writer_holdoff= [KNL]
|
||||
Write-side holdoff between grace periods,
|
||||
in microseconds. The default of zero says
|
||||
no holdoff.
|
||||
|
||||
rcutorture.cbflood_inter_holdoff= [KNL]
|
||||
Set holdoff time (jiffies) between successive
|
||||
callback-flood tests.
|
||||
@ -3715,8 +3775,14 @@
|
||||
slab_nomerge [MM]
|
||||
Disable merging of slabs with similar size. May be
|
||||
necessary if there is some reason to distinguish
|
||||
allocs to different slabs. Debug options disable
|
||||
merging on their own.
|
||||
allocs to different slabs, especially in hardened
|
||||
environments where the risk of heap overflows and
|
||||
layout control by attackers can usually be
|
||||
frustrated by disabling merging. This will reduce
|
||||
most of the exposure of a heap attack to a single
|
||||
cache (risks via metadata attacks are mostly
|
||||
unchanged). Debug options disable merging on their
|
||||
own.
|
||||
For more information see Documentation/vm/slub.txt.
|
||||
|
||||
slab_max_order= [MM, SLAB]
|
||||
@ -3803,6 +3869,15 @@
|
||||
spia_pedr=
|
||||
spia_peddr=
|
||||
|
||||
srcutree.counter_wrap_check [KNL]
|
||||
Specifies how frequently to check for
|
||||
grace-period sequence counter wrap for the
|
||||
srcu_data structure's ->srcu_gp_seq_needed field.
|
||||
The greater the number of bits set in this kernel
|
||||
parameter, the less frequently counter wrap will
|
||||
be checked for. Note that the bottom two bits
|
||||
are ignored.
|
||||
|
||||
srcutree.exp_holdoff [KNL]
|
||||
Specifies how many nanoseconds must elapse
|
||||
since the end of the last SRCU grace period for
|
||||
@ -3811,6 +3886,13 @@
|
||||
expediting. Set to zero to disable automatic
|
||||
expediting.
|
||||
|
||||
stack_guard_gap= [MM]
|
||||
override the default stack gap protection. The value
|
||||
is in page units and it defines how many pages prior
|
||||
to (for stacks growing down) resp. after (for stacks
|
||||
growing up) the main stack are reserved for no other
|
||||
mapping. Default value is 256 pages.
|
||||
|
||||
stacktrace [FTRACE]
|
||||
Enabled the stack tracer on boot up.
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
.. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy <cpufreq_policy>`
|
||||
.. |intel_pstate| replace:: :doc:`intel_pstate <intel_pstate>`
|
||||
|
||||
=======================
|
||||
CPU Performance Scaling
|
||||
@ -75,7 +76,7 @@ feedback registers, as that information is typically specific to the hardware
|
||||
interface it comes from and may not be easily represented in an abstract,
|
||||
platform-independent way. For this reason, ``CPUFreq`` allows scaling drivers
|
||||
to bypass the governor layer and implement their own performance scaling
|
||||
algorithms. That is done by the ``intel_pstate`` scaling driver.
|
||||
algorithms. That is done by the |intel_pstate| scaling driver.
|
||||
|
||||
|
||||
``CPUFreq`` Policy Objects
|
||||
@ -174,13 +175,13 @@ necessary to restart the scaling governor so that it can take the new online CPU
|
||||
into account. That is achieved by invoking the governor's ``->stop`` and
|
||||
``->start()`` callbacks, in this order, for the entire policy.
|
||||
|
||||
As mentioned before, the ``intel_pstate`` scaling driver bypasses the scaling
|
||||
As mentioned before, the |intel_pstate| scaling driver bypasses the scaling
|
||||
governor layer of ``CPUFreq`` and provides its own P-state selection algorithms.
|
||||
Consequently, if ``intel_pstate`` is used, scaling governors are not attached to
|
||||
Consequently, if |intel_pstate| is used, scaling governors are not attached to
|
||||
new policy objects. Instead, the driver's ``->setpolicy()`` callback is invoked
|
||||
to register per-CPU utilization update callbacks for each policy. These
|
||||
callbacks are invoked by the CPU scheduler in the same way as for scaling
|
||||
governors, but in the ``intel_pstate`` case they both determine the P-state to
|
||||
governors, but in the |intel_pstate| case they both determine the P-state to
|
||||
use and change the hardware configuration accordingly in one go from scheduler
|
||||
context.
|
||||
|
||||
@ -257,7 +258,7 @@ are the following:
|
||||
|
||||
``scaling_available_governors``
|
||||
List of ``CPUFreq`` scaling governors present in the kernel that can
|
||||
be attached to this policy or (if the ``intel_pstate`` scaling driver is
|
||||
be attached to this policy or (if the |intel_pstate| scaling driver is
|
||||
in use) list of scaling algorithms provided by the driver that can be
|
||||
applied to this policy.
|
||||
|
||||
@ -268,29 +269,29 @@ are the following:
|
||||
``scaling_cur_freq``
|
||||
Current frequency of all of the CPUs belonging to this policy (in kHz).
|
||||
|
||||
For the majority of scaling drivers, this is the frequency of the last
|
||||
P-state requested by the driver from the hardware using the scaling
|
||||
In the majority of cases, this is the frequency of the last P-state
|
||||
requested by the scaling driver from the hardware using the scaling
|
||||
interface provided by it, which may or may not reflect the frequency
|
||||
the CPU is actually running at (due to hardware design and other
|
||||
limitations).
|
||||
|
||||
Some scaling drivers (e.g. ``intel_pstate``) attempt to provide
|
||||
information more precisely reflecting the current CPU frequency through
|
||||
this attribute, but that still may not be the exact current CPU
|
||||
frequency as seen by the hardware at the moment.
|
||||
Some architectures (e.g. ``x86``) may attempt to provide information
|
||||
more precisely reflecting the current CPU frequency through this
|
||||
attribute, but that still may not be the exact current CPU frequency as
|
||||
seen by the hardware at the moment.
|
||||
|
||||
``scaling_driver``
|
||||
The scaling driver currently in use.
|
||||
|
||||
``scaling_governor``
|
||||
The scaling governor currently attached to this policy or (if the
|
||||
``intel_pstate`` scaling driver is in use) the scaling algorithm
|
||||
|intel_pstate| scaling driver is in use) the scaling algorithm
|
||||
provided by the driver that is currently applied to this policy.
|
||||
|
||||
This attribute is read-write and writing to it will cause a new scaling
|
||||
governor to be attached to this policy or a new scaling algorithm
|
||||
provided by the scaling driver to be applied to it (in the
|
||||
``intel_pstate`` case), as indicated by the string written to this
|
||||
|intel_pstate| case), as indicated by the string written to this
|
||||
attribute (which must be one of the names listed by the
|
||||
``scaling_available_governors`` attribute described above).
|
||||
|
||||
@ -619,7 +620,7 @@ This file is located under :file:`/sys/devices/system/cpu/cpufreq/` and controls
|
||||
the "boost" setting for the whole system. It is not present if the underlying
|
||||
scaling driver does not support the frequency boost mechanism (or supports it,
|
||||
but provides a driver-specific interface for controlling it, like
|
||||
``intel_pstate``).
|
||||
|intel_pstate|).
|
||||
|
||||
If the value in this file is 1, the frequency boost mechanism is enabled. This
|
||||
means that either the hardware can be put into states in which it is able to
|
||||
|
@ -6,6 +6,7 @@ Power Management
|
||||
:maxdepth: 2
|
||||
|
||||
cpufreq
|
||||
intel_pstate
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
|
753
Documentation/admin-guide/pm/intel_pstate.rst
Normal file
753
Documentation/admin-guide/pm/intel_pstate.rst
Normal file
@ -0,0 +1,753 @@
|
||||
===============================================
|
||||
``intel_pstate`` CPU Performance Scaling Driver
|
||||
===============================================
|
||||
|
||||
::
|
||||
|
||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
|
||||
General Information
|
||||
===================
|
||||
|
||||
``intel_pstate`` is a part of the
|
||||
:doc:`CPU performance scaling subsystem <cpufreq>` in the Linux kernel
|
||||
(``CPUFreq``). It is a scaling driver for the Sandy Bridge and later
|
||||
generations of Intel processors. Note, however, that some of those processors
|
||||
may not be supported. [To understand ``intel_pstate`` it is necessary to know
|
||||
how ``CPUFreq`` works in general, so this is the time to read :doc:`cpufreq` if
|
||||
you have not done that yet.]
|
||||
|
||||
For the processors supported by ``intel_pstate``, the P-state concept is broader
|
||||
than just an operating frequency or an operating performance point (see the
|
||||
`LinuxCon Europe 2015 presentation by Kristen Accardi <LCEU2015_>`_ for more
|
||||
information about that). For this reason, the representation of P-states used
|
||||
by ``intel_pstate`` internally follows the hardware specification (for details
|
||||
refer to `Intel® 64 and IA-32 Architectures Software Developer’s Manual
|
||||
Volume 3: System Programming Guide <SDM_>`_). However, the ``CPUFreq`` core
|
||||
uses frequencies for identifying operating performance points of CPUs and
|
||||
frequencies are involved in the user space interface exposed by it, so
|
||||
``intel_pstate`` maps its internal representation of P-states to frequencies too
|
||||
(fortunately, that mapping is unambiguous). At the same time, it would not be
|
||||
practical for ``intel_pstate`` to supply the ``CPUFreq`` core with a table of
|
||||
available frequencies due to the possible size of it, so the driver does not do
|
||||
that. Some functionality of the core is limited by that.
|
||||
|
||||
Since the hardware P-state selection interface used by ``intel_pstate`` is
|
||||
available at the logical CPU level, the driver always works with individual
|
||||
CPUs. Consequently, if ``intel_pstate`` is in use, every ``CPUFreq`` policy
|
||||
object corresponds to one logical CPU and ``CPUFreq`` policies are effectively
|
||||
equivalent to CPUs. In particular, this means that they become "inactive" every
|
||||
time the corresponding CPU is taken offline and need to be re-initialized when
|
||||
it goes back online.
|
||||
|
||||
``intel_pstate`` is not modular, so it cannot be unloaded, which means that the
|
||||
only way to pass early-configuration-time parameters to it is via the kernel
|
||||
command line. However, its configuration can be adjusted via ``sysfs`` to a
|
||||
great extent. In some configurations it even is possible to unregister it via
|
||||
``sysfs`` which allows another ``CPUFreq`` scaling driver to be loaded and
|
||||
registered (see `below <status_attr_>`_).
|
||||
|
||||
|
||||
Operation Modes
|
||||
===============
|
||||
|
||||
``intel_pstate`` can operate in three different modes: in the active mode with
|
||||
or without hardware-managed P-states support and in the passive mode. Which of
|
||||
them will be in effect depends on what kernel command line options are used and
|
||||
on the capabilities of the processor.
|
||||
|
||||
Active Mode
|
||||
-----------
|
||||
|
||||
This is the default operation mode of ``intel_pstate``. If it works in this
|
||||
mode, the ``scaling_driver`` policy attribute in ``sysfs`` for all ``CPUFreq``
|
||||
policies contains the string "intel_pstate".
|
||||
|
||||
In this mode the driver bypasses the scaling governors layer of ``CPUFreq`` and
|
||||
provides its own scaling algorithms for P-state selection. Those algorithms
|
||||
can be applied to ``CPUFreq`` policies in the same way as generic scaling
|
||||
governors (that is, through the ``scaling_governor`` policy attribute in
|
||||
``sysfs``). [Note that different P-state selection algorithms may be chosen for
|
||||
different policies, but that is not recommended.]
|
||||
|
||||
They are not generic scaling governors, but their names are the same as the
|
||||
names of some of those governors. Moreover, confusingly enough, they generally
|
||||
do not work in the same way as the generic governors they share the names with.
|
||||
For example, the ``powersave`` P-state selection algorithm provided by
|
||||
``intel_pstate`` is not a counterpart of the generic ``powersave`` governor
|
||||
(roughly, it corresponds to the ``schedutil`` and ``ondemand`` governors).
|
||||
|
||||
There are two P-state selection algorithms provided by ``intel_pstate`` in the
|
||||
active mode: ``powersave`` and ``performance``. The way they both operate
|
||||
depends on whether or not the hardware-managed P-states (HWP) feature has been
|
||||
enabled in the processor and possibly on the processor model.
|
||||
|
||||
Which of the P-state selection algorithms is used by default depends on the
|
||||
:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option.
|
||||
Namely, if that option is set, the ``performance`` algorithm will be used by
|
||||
default, and the other one will be used by default if it is not set.
|
||||
|
||||
Active Mode With HWP
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If the processor supports the HWP feature, it will be enabled during the
|
||||
processor initialization and cannot be disabled after that. It is possible
|
||||
to avoid enabling it by passing the ``intel_pstate=no_hwp`` argument to the
|
||||
kernel in the command line.
|
||||
|
||||
If the HWP feature has been enabled, ``intel_pstate`` relies on the processor to
|
||||
select P-states by itself, but still it can give hints to the processor's
|
||||
internal P-state selection logic. What those hints are depends on which P-state
|
||||
selection algorithm has been applied to the given policy (or to the CPU it
|
||||
corresponds to).
|
||||
|
||||
Even though the P-state selection is carried out by the processor automatically,
|
||||
``intel_pstate`` registers utilization update callbacks with the CPU scheduler
|
||||
in this mode. However, they are not used for running a P-state selection
|
||||
algorithm, but for periodic updates of the current CPU frequency information to
|
||||
be made available from the ``scaling_cur_freq`` policy attribute in ``sysfs``.
|
||||
|
||||
HWP + ``performance``
|
||||
.....................
|
||||
|
||||
In this configuration ``intel_pstate`` will write 0 to the processor's
|
||||
Energy-Performance Preference (EPP) knob (if supported) or its
|
||||
Energy-Performance Bias (EPB) knob (otherwise), which means that the processor's
|
||||
internal P-state selection logic is expected to focus entirely on performance.
|
||||
|
||||
This will override the EPP/EPB setting coming from the ``sysfs`` interface
|
||||
(see `Energy vs Performance Hints`_ below).
|
||||
|
||||
Also, in this configuration the range of P-states available to the processor's
|
||||
internal P-state selection logic is always restricted to the upper boundary
|
||||
(that is, the maximum P-state that the driver is allowed to use).
|
||||
|
||||
HWP + ``powersave``
|
||||
...................
|
||||
|
||||
In this configuration ``intel_pstate`` will set the processor's
|
||||
Energy-Performance Preference (EPP) knob (if supported) or its
|
||||
Energy-Performance Bias (EPB) knob (otherwise) to whatever value it was
|
||||
previously set to via ``sysfs`` (or whatever default value it was
|
||||
set to by the platform firmware). This usually causes the processor's
|
||||
internal P-state selection logic to be less performance-focused.
|
||||
|
||||
Active Mode Without HWP
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This is the default operation mode for processors that do not support the HWP
|
||||
feature. It also is used by default with the ``intel_pstate=no_hwp`` argument
|
||||
in the kernel command line. However, in this mode ``intel_pstate`` may refuse
|
||||
to work with the given processor if it does not recognize it. [Note that
|
||||
``intel_pstate`` will never refuse to work with any processor with the HWP
|
||||
feature enabled.]
|
||||
|
||||
In this mode ``intel_pstate`` registers utilization update callbacks with the
|
||||
CPU scheduler in order to run a P-state selection algorithm, either
|
||||
``powersave`` or ``performance``, depending on the ``scaling_cur_freq`` policy
|
||||
setting in ``sysfs``. The current CPU frequency information to be made
|
||||
available from the ``scaling_cur_freq`` policy attribute in ``sysfs`` is
|
||||
periodically updated by those utilization update callbacks too.
|
||||
|
||||
``performance``
|
||||
...............
|
||||
|
||||
Without HWP, this P-state selection algorithm is always the same regardless of
|
||||
the processor model and platform configuration.
|
||||
|
||||
It selects the maximum P-state it is allowed to use, subject to limits set via
|
||||
``sysfs``, every time the driver configuration for the given CPU is updated
|
||||
(e.g. via ``sysfs``).
|
||||
|
||||
This is the default P-state selection algorithm if the
|
||||
:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
|
||||
is set.
|
||||
|
||||
``powersave``
|
||||
.............
|
||||
|
||||
Without HWP, this P-state selection algorithm generally depends on the
|
||||
processor model and/or the system profile setting in the ACPI tables and there
|
||||
are two variants of it.
|
||||
|
||||
One of them is used with processors from the Atom line and (regardless of the
|
||||
processor model) on platforms with the system profile in the ACPI tables set to
|
||||
"mobile" (laptops mostly), "tablet", "appliance PC", "desktop", or
|
||||
"workstation". It is also used with processors supporting the HWP feature if
|
||||
that feature has not been enabled (that is, with the ``intel_pstate=no_hwp``
|
||||
argument in the kernel command line). It is similar to the algorithm
|
||||
implemented by the generic ``schedutil`` scaling governor except that the
|
||||
utilization metric used by it is based on numbers coming from feedback
|
||||
registers of the CPU. It generally selects P-states proportional to the
|
||||
current CPU utilization, so it is referred to as the "proportional" algorithm.
|
||||
|
||||
The second variant of the ``powersave`` P-state selection algorithm, used in all
|
||||
of the other cases (generally, on processors from the Core line, so it is
|
||||
referred to as the "Core" algorithm), is based on the values read from the APERF
|
||||
and MPERF feedback registers and the previously requested target P-state.
|
||||
It does not really take CPU utilization into account explicitly, but as a rule
|
||||
it causes the CPU P-state to ramp up very quickly in response to increased
|
||||
utilization which is generally desirable in server environments.
|
||||
|
||||
Regardless of the variant, this algorithm is run by the driver's utilization
|
||||
update callback for the given CPU when it is invoked by the CPU scheduler, but
|
||||
not more often than every 10 ms (that can be tweaked via ``debugfs`` in `this
|
||||
particular case <Tuning Interface in debugfs_>`_). Like in the ``performance``
|
||||
case, the hardware configuration is not touched if the new P-state turns out to
|
||||
be the same as the current one.
|
||||
|
||||
This is the default P-state selection algorithm if the
|
||||
:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
|
||||
is not set.
|
||||
|
||||
Passive Mode
|
||||
------------
|
||||
|
||||
This mode is used if the ``intel_pstate=passive`` argument is passed to the
|
||||
kernel in the command line (it implies the ``intel_pstate=no_hwp`` setting too).
|
||||
Like in the active mode without HWP support, in this mode ``intel_pstate`` may
|
||||
refuse to work with the given processor if it does not recognize it.
|
||||
|
||||
If the driver works in this mode, the ``scaling_driver`` policy attribute in
|
||||
``sysfs`` for all ``CPUFreq`` policies contains the string "intel_cpufreq".
|
||||
Then, the driver behaves like a regular ``CPUFreq`` scaling driver. That is,
|
||||
it is invoked by generic scaling governors when necessary to talk to the
|
||||
hardware in order to change the P-state of a CPU (in particular, the
|
||||
``schedutil`` governor can invoke it directly from scheduler context).
|
||||
|
||||
While in this mode, ``intel_pstate`` can be used with all of the (generic)
|
||||
scaling governors listed by the ``scaling_available_governors`` policy attribute
|
||||
in ``sysfs`` (and the P-state selection algorithms described above are not
|
||||
used). Then, it is responsible for the configuration of policy objects
|
||||
corresponding to CPUs and provides the ``CPUFreq`` core (and the scaling
|
||||
governors attached to the policy objects) with accurate information on the
|
||||
maximum and minimum operating frequencies supported by the hardware (including
|
||||
the so-called "turbo" frequency ranges). In other words, in the passive mode
|
||||
the entire range of available P-states is exposed by ``intel_pstate`` to the
|
||||
``CPUFreq`` core. However, in this mode the driver does not register
|
||||
utilization update callbacks with the CPU scheduler and the ``scaling_cur_freq``
|
||||
information comes from the ``CPUFreq`` core (and is the last frequency selected
|
||||
by the current scaling governor for the given policy).
|
||||
|
||||
|
||||
.. _turbo:
|
||||
|
||||
Turbo P-states Support
|
||||
======================
|
||||
|
||||
In the majority of cases, the entire range of P-states available to
|
||||
``intel_pstate`` can be divided into two sub-ranges that correspond to
|
||||
different types of processor behavior, above and below a boundary that
|
||||
will be referred to as the "turbo threshold" in what follows.
|
||||
|
||||
The P-states above the turbo threshold are referred to as "turbo P-states" and
|
||||
the whole sub-range of P-states they belong to is referred to as the "turbo
|
||||
range". These names are related to the Turbo Boost technology allowing a
|
||||
multicore processor to opportunistically increase the P-state of one or more
|
||||
cores if there is enough power to do that and if that is not going to cause the
|
||||
thermal envelope of the processor package to be exceeded.
|
||||
|
||||
Specifically, if software sets the P-state of a CPU core within the turbo range
|
||||
(that is, above the turbo threshold), the processor is permitted to take over
|
||||
performance scaling control for that core and put it into turbo P-states of its
|
||||
choice going forward. However, that permission is interpreted differently by
|
||||
different processor generations. Namely, the Sandy Bridge generation of
|
||||
processors will never use any P-states above the last one set by software for
|
||||
the given core, even if it is within the turbo range, whereas all of the later
|
||||
processor generations will take it as a license to use any P-states from the
|
||||
turbo range, even above the one set by software. In other words, on those
|
||||
processors setting any P-state from the turbo range will enable the processor
|
||||
to put the given core into all turbo P-states up to and including the maximum
|
||||
supported one as it sees fit.
|
||||
|
||||
One important property of turbo P-states is that they are not sustainable. More
|
||||
precisely, there is no guarantee that any CPUs will be able to stay in any of
|
||||
those states indefinitely, because the power distribution within the processor
|
||||
package may change over time or the thermal envelope it was designed for might
|
||||
be exceeded if a turbo P-state was used for too long.
|
||||
|
||||
In turn, the P-states below the turbo threshold generally are sustainable. In
|
||||
fact, if one of them is set by software, the processor is not expected to change
|
||||
it to a lower one unless in a thermal stress or a power limit violation
|
||||
situation (a higher P-state may still be used if it is set for another CPU in
|
||||
the same package at the same time, for example).
|
||||
|
||||
Some processors allow multiple cores to be in turbo P-states at the same time,
|
||||
but the maximum P-state that can be set for them generally depends on the number
|
||||
of cores running concurrently. The maximum turbo P-state that can be set for 3
|
||||
cores at the same time usually is lower than the analogous maximum P-state for
|
||||
2 cores, which in turn usually is lower than the maximum turbo P-state that can
|
||||
be set for 1 core. The one-core maximum turbo P-state is thus the maximum
|
||||
supported one overall.
|
||||
|
||||
The maximum supported turbo P-state, the turbo threshold (the maximum supported
|
||||
non-turbo P-state) and the minimum supported P-state are specific to the
|
||||
processor model and can be determined by reading the processor's model-specific
|
||||
registers (MSRs). Moreover, some processors support the Configurable TDP
|
||||
(Thermal Design Power) feature and, when that feature is enabled, the turbo
|
||||
threshold effectively becomes a configurable value that can be set by the
|
||||
platform firmware.
|
||||
|
||||
Unlike ``_PSS`` objects in the ACPI tables, ``intel_pstate`` always exposes
|
||||
the entire range of available P-states, including the whole turbo range, to the
|
||||
``CPUFreq`` core and (in the passive mode) to generic scaling governors. This
|
||||
generally causes turbo P-states to be set more often when ``intel_pstate`` is
|
||||
used relative to ACPI-based CPU performance scaling (see `below <acpi-cpufreq_>`_
|
||||
for more information).
|
||||
|
||||
Moreover, since ``intel_pstate`` always knows what the real turbo threshold is
|
||||
(even if the Configurable TDP feature is enabled in the processor), its
|
||||
``no_turbo`` attribute in ``sysfs`` (described `below <no_turbo_attr_>`_) should
|
||||
work as expected in all cases (that is, if set to disable turbo P-states, it
|
||||
always should prevent ``intel_pstate`` from using them).
|
||||
|
||||
|
||||
Processor Support
|
||||
=================
|
||||
|
||||
To handle a given processor ``intel_pstate`` requires a number of different
|
||||
pieces of information on it to be known, including:
|
||||
|
||||
* The minimum supported P-state.
|
||||
|
||||
* The maximum supported `non-turbo P-state <turbo_>`_.
|
||||
|
||||
* Whether or not turbo P-states are supported at all.
|
||||
|
||||
* The maximum supported `one-core turbo P-state <turbo_>`_ (if turbo P-states
|
||||
are supported).
|
||||
|
||||
* The scaling formula to translate the driver's internal representation
|
||||
of P-states into frequencies and the other way around.
|
||||
|
||||
Generally, ways to obtain that information are specific to the processor model
|
||||
or family. Although it often is possible to obtain all of it from the processor
|
||||
itself (using model-specific registers), there are cases in which hardware
|
||||
manuals need to be consulted to get to it too.
|
||||
|
||||
For this reason, there is a list of supported processors in ``intel_pstate`` and
|
||||
the driver initialization will fail if the detected processor is not in that
|
||||
list, unless it supports the `HWP feature <Active Mode_>`_. [The interface to
|
||||
obtain all of the information listed above is the same for all of the processors
|
||||
supporting the HWP feature, which is why they all are supported by
|
||||
``intel_pstate``.]
|
||||
|
||||
|
||||
User Space Interface in ``sysfs``
|
||||
=================================
|
||||
|
||||
Global Attributes
|
||||
-----------------
|
||||
|
||||
``intel_pstate`` exposes several global attributes (files) in ``sysfs`` to
|
||||
control its functionality at the system level. They are located in the
|
||||
``/sys/devices/system/cpu/cpufreq/intel_pstate/`` directory and affect all
|
||||
CPUs.
|
||||
|
||||
Some of them are not present if the ``intel_pstate=per_cpu_perf_limits``
|
||||
argument is passed to the kernel in the command line.
|
||||
|
||||
``max_perf_pct``
|
||||
Maximum P-state the driver is allowed to set in percent of the
|
||||
maximum supported performance level (the highest supported `turbo
|
||||
P-state <turbo_>`_).
|
||||
|
||||
This attribute will not be exposed if the
|
||||
``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel
|
||||
command line.
|
||||
|
||||
``min_perf_pct``
|
||||
Minimum P-state the driver is allowed to set in percent of the
|
||||
maximum supported performance level (the highest supported `turbo
|
||||
P-state <turbo_>`_).
|
||||
|
||||
This attribute will not be exposed if the
|
||||
``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel
|
||||
command line.
|
||||
|
||||
``num_pstates``
|
||||
Number of P-states supported by the processor (between 0 and 255
|
||||
inclusive) including both turbo and non-turbo P-states (see
|
||||
`Turbo P-states Support`_).
|
||||
|
||||
The value of this attribute is not affected by the ``no_turbo``
|
||||
setting described `below <no_turbo_attr_>`_.
|
||||
|
||||
This attribute is read-only.
|
||||
|
||||
``turbo_pct``
|
||||
Ratio of the `turbo range <turbo_>`_ size to the size of the entire
|
||||
range of supported P-states, in percent.
|
||||
|
||||
This attribute is read-only.
|
||||
|
||||
.. _no_turbo_attr:
|
||||
|
||||
``no_turbo``
|
||||
If set (equal to 1), the driver is not allowed to set any turbo P-states
|
||||
(see `Turbo P-states Support`_). If unset (equalt to 0, which is the
|
||||
default), turbo P-states can be set by the driver.
|
||||
[Note that ``intel_pstate`` does not support the general ``boost``
|
||||
attribute (supported by some other scaling drivers) which is replaced
|
||||
by this one.]
|
||||
|
||||
This attrubute does not affect the maximum supported frequency value
|
||||
supplied to the ``CPUFreq`` core and exposed via the policy interface,
|
||||
but it affects the maximum possible value of per-policy P-state limits
|
||||
(see `Interpretation of Policy Attributes`_ below for details).
|
||||
|
||||
.. _status_attr:
|
||||
|
||||
``status``
|
||||
Operation mode of the driver: "active", "passive" or "off".
|
||||
|
||||
"active"
|
||||
The driver is functional and in the `active mode
|
||||
<Active Mode_>`_.
|
||||
|
||||
"passive"
|
||||
The driver is functional and in the `passive mode
|
||||
<Passive Mode_>`_.
|
||||
|
||||
"off"
|
||||
The driver is not functional (it is not registered as a scaling
|
||||
driver with the ``CPUFreq`` core).
|
||||
|
||||
This attribute can be written to in order to change the driver's
|
||||
operation mode or to unregister it. The string written to it must be
|
||||
one of the possible values of it and, if successful, the write will
|
||||
cause the driver to switch over to the operation mode represented by
|
||||
that string - or to be unregistered in the "off" case. [Actually,
|
||||
switching over from the active mode to the passive mode or the other
|
||||
way around causes the driver to be unregistered and registered again
|
||||
with a different set of callbacks, so all of its settings (the global
|
||||
as well as the per-policy ones) are then reset to their default
|
||||
values, possibly depending on the target operation mode.]
|
||||
|
||||
That only is supported in some configurations, though (for example, if
|
||||
the `HWP feature is enabled in the processor <Active Mode With HWP_>`_,
|
||||
the operation mode of the driver cannot be changed), and if it is not
|
||||
supported in the current configuration, writes to this attribute with
|
||||
fail with an appropriate error.
|
||||
|
||||
Interpretation of Policy Attributes
|
||||
-----------------------------------
|
||||
|
||||
The interpretation of some ``CPUFreq`` policy attributes described in
|
||||
:doc:`cpufreq` is special with ``intel_pstate`` as the current scaling driver
|
||||
and it generally depends on the driver's `operation mode <Operation Modes_>`_.
|
||||
|
||||
First of all, the values of the ``cpuinfo_max_freq``, ``cpuinfo_min_freq`` and
|
||||
``scaling_cur_freq`` attributes are produced by applying a processor-specific
|
||||
multiplier to the internal P-state representation used by ``intel_pstate``.
|
||||
Also, the values of the ``scaling_max_freq`` and ``scaling_min_freq``
|
||||
attributes are capped by the frequency corresponding to the maximum P-state that
|
||||
the driver is allowed to set.
|
||||
|
||||
If the ``no_turbo`` `global attribute <no_turbo_attr_>`_ is set, the driver is
|
||||
not allowed to use turbo P-states, so the maximum value of ``scaling_max_freq``
|
||||
and ``scaling_min_freq`` is limited to the maximum non-turbo P-state frequency.
|
||||
Accordingly, setting ``no_turbo`` causes ``scaling_max_freq`` and
|
||||
``scaling_min_freq`` to go down to that value if they were above it before.
|
||||
However, the old values of ``scaling_max_freq`` and ``scaling_min_freq`` will be
|
||||
restored after unsetting ``no_turbo``, unless these attributes have been written
|
||||
to after ``no_turbo`` was set.
|
||||
|
||||
If ``no_turbo`` is not set, the maximum possible value of ``scaling_max_freq``
|
||||
and ``scaling_min_freq`` corresponds to the maximum supported turbo P-state,
|
||||
which also is the value of ``cpuinfo_max_freq`` in either case.
|
||||
|
||||
Next, the following policy attributes have special meaning if
|
||||
``intel_pstate`` works in the `active mode <Active Mode_>`_:
|
||||
|
||||
``scaling_available_governors``
|
||||
List of P-state selection algorithms provided by ``intel_pstate``.
|
||||
|
||||
``scaling_governor``
|
||||
P-state selection algorithm provided by ``intel_pstate`` currently in
|
||||
use with the given policy.
|
||||
|
||||
``scaling_cur_freq``
|
||||
Frequency of the average P-state of the CPU represented by the given
|
||||
policy for the time interval between the last two invocations of the
|
||||
driver's utilization update callback by the CPU scheduler for that CPU.
|
||||
|
||||
The meaning of these attributes in the `passive mode <Passive Mode_>`_ is the
|
||||
same as for other scaling drivers.
|
||||
|
||||
Additionally, the value of the ``scaling_driver`` attribute for ``intel_pstate``
|
||||
depends on the operation mode of the driver. Namely, it is either
|
||||
"intel_pstate" (in the `active mode <Active Mode_>`_) or "intel_cpufreq" (in the
|
||||
`passive mode <Passive Mode_>`_).
|
||||
|
||||
Coordination of P-State Limits
|
||||
------------------------------
|
||||
|
||||
``intel_pstate`` allows P-state limits to be set in two ways: with the help of
|
||||
the ``max_perf_pct`` and ``min_perf_pct`` `global attributes
|
||||
<Global Attributes_>`_ or via the ``scaling_max_freq`` and ``scaling_min_freq``
|
||||
``CPUFreq`` policy attributes. The coordination between those limits is based
|
||||
on the following rules, regardless of the current operation mode of the driver:
|
||||
|
||||
1. All CPUs are affected by the global limits (that is, none of them can be
|
||||
requested to run faster than the global maximum and none of them can be
|
||||
requested to run slower than the global minimum).
|
||||
|
||||
2. Each individual CPU is affected by its own per-policy limits (that is, it
|
||||
cannot be requested to run faster than its own per-policy maximum and it
|
||||
cannot be requested to run slower than its own per-policy minimum).
|
||||
|
||||
3. The global and per-policy limits can be set independently.
|
||||
|
||||
If the `HWP feature is enabled in the processor <Active Mode With HWP_>`_, the
|
||||
resulting effective values are written into its registers whenever the limits
|
||||
change in order to request its internal P-state selection logic to always set
|
||||
P-states within these limits. Otherwise, the limits are taken into account by
|
||||
scaling governors (in the `passive mode <Passive Mode_>`_) and by the driver
|
||||
every time before setting a new P-state for a CPU.
|
||||
|
||||
Additionally, if the ``intel_pstate=per_cpu_perf_limits`` command line argument
|
||||
is passed to the kernel, ``max_perf_pct`` and ``min_perf_pct`` are not exposed
|
||||
at all and the only way to set the limits is by using the policy attributes.
|
||||
|
||||
|
||||
Energy vs Performance Hints
|
||||
---------------------------
|
||||
|
||||
If ``intel_pstate`` works in the `active mode with the HWP feature enabled
|
||||
<Active Mode With HWP_>`_ in the processor, additional attributes are present
|
||||
in every ``CPUFreq`` policy directory in ``sysfs``. They are intended to allow
|
||||
user space to help ``intel_pstate`` to adjust the processor's internal P-state
|
||||
selection logic by focusing it on performance or on energy-efficiency, or
|
||||
somewhere between the two extremes:
|
||||
|
||||
``energy_performance_preference``
|
||||
Current value of the energy vs performance hint for the given policy
|
||||
(or the CPU represented by it).
|
||||
|
||||
The hint can be changed by writing to this attribute.
|
||||
|
||||
``energy_performance_available_preferences``
|
||||
List of strings that can be written to the
|
||||
``energy_performance_preference`` attribute.
|
||||
|
||||
They represent different energy vs performance hints and should be
|
||||
self-explanatory, except that ``default`` represents whatever hint
|
||||
value was set by the platform firmware.
|
||||
|
||||
Strings written to the ``energy_performance_preference`` attribute are
|
||||
internally translated to integer values written to the processor's
|
||||
Energy-Performance Preference (EPP) knob (if supported) or its
|
||||
Energy-Performance Bias (EPB) knob.
|
||||
|
||||
[Note that tasks may by migrated from one CPU to another by the scheduler's
|
||||
load-balancing algorithm and if different energy vs performance hints are
|
||||
set for those CPUs, that may lead to undesirable outcomes. To avoid such
|
||||
issues it is better to set the same energy vs performance hint for all CPUs
|
||||
or to pin every task potentially sensitive to them to a specific CPU.]
|
||||
|
||||
.. _acpi-cpufreq:
|
||||
|
||||
``intel_pstate`` vs ``acpi-cpufreq``
|
||||
====================================
|
||||
|
||||
On the majority of systems supported by ``intel_pstate``, the ACPI tables
|
||||
provided by the platform firmware contain ``_PSS`` objects returning information
|
||||
that can be used for CPU performance scaling (refer to the `ACPI specification`_
|
||||
for details on the ``_PSS`` objects and the format of the information returned
|
||||
by them).
|
||||
|
||||
The information returned by the ACPI ``_PSS`` objects is used by the
|
||||
``acpi-cpufreq`` scaling driver. On systems supported by ``intel_pstate``
|
||||
the ``acpi-cpufreq`` driver uses the same hardware CPU performance scaling
|
||||
interface, but the set of P-states it can use is limited by the ``_PSS``
|
||||
output.
|
||||
|
||||
On those systems each ``_PSS`` object returns a list of P-states supported by
|
||||
the corresponding CPU which basically is a subset of the P-states range that can
|
||||
be used by ``intel_pstate`` on the same system, with one exception: the whole
|
||||
`turbo range <turbo_>`_ is represented by one item in it (the topmost one). By
|
||||
convention, the frequency returned by ``_PSS`` for that item is greater by 1 MHz
|
||||
than the frequency of the highest non-turbo P-state listed by it, but the
|
||||
corresponding P-state representation (following the hardware specification)
|
||||
returned for it matches the maximum supported turbo P-state (or is the
|
||||
special value 255 meaning essentially "go as high as you can get").
|
||||
|
||||
The list of P-states returned by ``_PSS`` is reflected by the table of
|
||||
available frequencies supplied by ``acpi-cpufreq`` to the ``CPUFreq`` core and
|
||||
scaling governors and the minimum and maximum supported frequencies reported by
|
||||
it come from that list as well. In particular, given the special representation
|
||||
of the turbo range described above, this means that the maximum supported
|
||||
frequency reported by ``acpi-cpufreq`` is higher by 1 MHz than the frequency
|
||||
of the highest supported non-turbo P-state listed by ``_PSS`` which, of course,
|
||||
affects decisions made by the scaling governors, except for ``powersave`` and
|
||||
``performance``.
|
||||
|
||||
For example, if a given governor attempts to select a frequency proportional to
|
||||
estimated CPU load and maps the load of 100% to the maximum supported frequency
|
||||
(possibly multiplied by a constant), then it will tend to choose P-states below
|
||||
the turbo threshold if ``acpi-cpufreq`` is used as the scaling driver, because
|
||||
in that case the turbo range corresponds to a small fraction of the frequency
|
||||
band it can use (1 MHz vs 1 GHz or more). In consequence, it will only go to
|
||||
the turbo range for the highest loads and the other loads above 50% that might
|
||||
benefit from running at turbo frequencies will be given non-turbo P-states
|
||||
instead.
|
||||
|
||||
One more issue related to that may appear on systems supporting the
|
||||
`Configurable TDP feature <turbo_>`_ allowing the platform firmware to set the
|
||||
turbo threshold. Namely, if that is not coordinated with the lists of P-states
|
||||
returned by ``_PSS`` properly, there may be more than one item corresponding to
|
||||
a turbo P-state in those lists and there may be a problem with avoiding the
|
||||
turbo range (if desirable or necessary). Usually, to avoid using turbo
|
||||
P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state listed
|
||||
by ``_PSS``, but that is not sufficient when there are other turbo P-states in
|
||||
the list returned by it.
|
||||
|
||||
Apart from the above, ``acpi-cpufreq`` works like ``intel_pstate`` in the
|
||||
`passive mode <Passive Mode_>`_, except that the number of P-states it can set
|
||||
is limited to the ones listed by the ACPI ``_PSS`` objects.
|
||||
|
||||
|
||||
Kernel Command Line Options for ``intel_pstate``
|
||||
================================================
|
||||
|
||||
Several kernel command line options can be used to pass early-configuration-time
|
||||
parameters to ``intel_pstate`` in order to enforce specific behavior of it. All
|
||||
of them have to be prepended with the ``intel_pstate=`` prefix.
|
||||
|
||||
``disable``
|
||||
Do not register ``intel_pstate`` as the scaling driver even if the
|
||||
processor is supported by it.
|
||||
|
||||
``passive``
|
||||
Register ``intel_pstate`` in the `passive mode <Passive Mode_>`_ to
|
||||
start with.
|
||||
|
||||
This option implies the ``no_hwp`` one described below.
|
||||
|
||||
``force``
|
||||
Register ``intel_pstate`` as the scaling driver instead of
|
||||
``acpi-cpufreq`` even if the latter is preferred on the given system.
|
||||
|
||||
This may prevent some platform features (such as thermal controls and
|
||||
power capping) that rely on the availability of ACPI P-states
|
||||
information from functioning as expected, so it should be used with
|
||||
caution.
|
||||
|
||||
This option does not work with processors that are not supported by
|
||||
``intel_pstate`` and on platforms where the ``pcc-cpufreq`` scaling
|
||||
driver is used instead of ``acpi-cpufreq``.
|
||||
|
||||
``no_hwp``
|
||||
Do not enable the `hardware-managed P-states (HWP) feature
|
||||
<Active Mode With HWP_>`_ even if it is supported by the processor.
|
||||
|
||||
``hwp_only``
|
||||
Register ``intel_pstate`` as the scaling driver only if the
|
||||
`hardware-managed P-states (HWP) feature <Active Mode With HWP_>`_ is
|
||||
supported by the processor.
|
||||
|
||||
``support_acpi_ppc``
|
||||
Take ACPI ``_PPC`` performance limits into account.
|
||||
|
||||
If the preferred power management profile in the FADT (Fixed ACPI
|
||||
Description Table) is set to "Enterprise Server" or "Performance
|
||||
Server", the ACPI ``_PPC`` limits are taken into account by default
|
||||
and this option has no effect.
|
||||
|
||||
``per_cpu_perf_limits``
|
||||
Use per-logical-CPU P-State limits (see `Coordination of P-state
|
||||
Limits`_ for details).
|
||||
|
||||
|
||||
Diagnostics and Tuning
|
||||
======================
|
||||
|
||||
Trace Events
|
||||
------------
|
||||
|
||||
There are two static trace events that can be used for ``intel_pstate``
|
||||
diagnostics. One of them is the ``cpu_frequency`` trace event generally used
|
||||
by ``CPUFreq``, and the other one is the ``pstate_sample`` trace event specific
|
||||
to ``intel_pstate``. Both of them are triggered by ``intel_pstate`` only if
|
||||
it works in the `active mode <Active Mode_>`_.
|
||||
|
||||
The following sequence of shell commands can be used to enable them and see
|
||||
their output (if the kernel is generally configured to support event tracing)::
|
||||
|
||||
# cd /sys/kernel/debug/tracing/
|
||||
# echo 1 > events/power/pstate_sample/enable
|
||||
# echo 1 > events/power/cpu_frequency/enable
|
||||
# cat trace
|
||||
gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107 scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618 freq=2474476
|
||||
cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2
|
||||
|
||||
If ``intel_pstate`` works in the `passive mode <Passive Mode_>`_, the
|
||||
``cpu_frequency`` trace event will be triggered either by the ``schedutil``
|
||||
scaling governor (for the policies it is attached to), or by the ``CPUFreq``
|
||||
core (for the policies with other scaling governors).
|
||||
|
||||
``ftrace``
|
||||
----------
|
||||
|
||||
The ``ftrace`` interface can be used for low-level diagnostics of
|
||||
``intel_pstate``. For example, to check how often the function to set a
|
||||
P-state is called, the ``ftrace`` filter can be set to to
|
||||
:c:func:`intel_pstate_set_pstate`::
|
||||
|
||||
# cd /sys/kernel/debug/tracing/
|
||||
# cat available_filter_functions | grep -i pstate
|
||||
intel_pstate_set_pstate
|
||||
intel_pstate_cpu_init
|
||||
...
|
||||
# echo intel_pstate_set_pstate > set_ftrace_filter
|
||||
# echo function > current_tracer
|
||||
# cat trace | head -15
|
||||
# tracer: function
|
||||
#
|
||||
# entries-in-buffer/entries-written: 80/80 #P:4
|
||||
#
|
||||
# _-----=> irqs-off
|
||||
# / _----=> need-resched
|
||||
# | / _---=> hardirq/softirq
|
||||
# || / _--=> preempt-depth
|
||||
# ||| / delay
|
||||
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
|
||||
# | | | |||| | |
|
||||
Xorg-3129 [000] ..s. 2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||
gnome-terminal--4510 [002] ..s. 2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||
gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||
|
||||
Tuning Interface in ``debugfs``
|
||||
-------------------------------
|
||||
|
||||
The ``powersave`` algorithm provided by ``intel_pstate`` for `the Core line of
|
||||
processors in the active mode <powersave_>`_ is based on a `PID controller`_
|
||||
whose parameters were chosen to address a number of different use cases at the
|
||||
same time. However, it still is possible to fine-tune it to a specific workload
|
||||
and the ``debugfs`` interface under ``/sys/kernel/debug/pstate_snb/`` is
|
||||
provided for this purpose. [Note that the ``pstate_snb`` directory will be
|
||||
present only if the specific P-state selection algorithm matching the interface
|
||||
in it actually is in use.]
|
||||
|
||||
The following files present in that directory can be used to modify the PID
|
||||
controller parameters at run time:
|
||||
|
||||
| ``deadband``
|
||||
| ``d_gain_pct``
|
||||
| ``i_gain_pct``
|
||||
| ``p_gain_pct``
|
||||
| ``sample_rate_ms``
|
||||
| ``setpoint``
|
||||
|
||||
Note, however, that achieving desirable results this way generally requires
|
||||
expert-level understanding of the power vs performance tradeoff, so extra care
|
||||
is recommended when attempting to do that.
|
||||
|
||||
|
||||
.. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||
.. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
||||
.. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf
|
||||
.. _PID controller: https://en.wikipedia.org/wiki/PID_controller
|
@ -344,9 +344,9 @@ for more than 2 channels, like Fully Buffered DIMMs (FB-DIMMs) memory
|
||||
controllers. The following example will assume 2 channels:
|
||||
|
||||
+------------+-----------------------+
|
||||
| Chip | Channels |
|
||||
| Select +-----------+-----------+
|
||||
| rows | ``ch0`` | ``ch1`` |
|
||||
| CS Rows | Channels |
|
||||
+------------+-----------+-----------+
|
||||
| | ``ch0`` | ``ch1`` |
|
||||
+============+===========+===========+
|
||||
| ``csrow0`` | DIMM_A0 | DIMM_B0 |
|
||||
+------------+ | |
|
||||
@ -698,7 +698,7 @@ information indicating that errors have been detected::
|
||||
The structure of the message is:
|
||||
|
||||
+---------------------------------------+-------------+
|
||||
| Content + Example |
|
||||
| Content | Example |
|
||||
+=======================================+=============+
|
||||
| The memory controller | MC0 |
|
||||
+---------------------------------------+-------------+
|
||||
@ -713,7 +713,7 @@ The structure of the message is:
|
||||
+---------------------------------------+-------------+
|
||||
| The error syndrome | 0xb741 |
|
||||
+---------------------------------------+-------------+
|
||||
| Memory row | row 0 +
|
||||
| Memory row | row 0 |
|
||||
+---------------------------------------+-------------+
|
||||
| Memory channel | channel 1 |
|
||||
+---------------------------------------+-------------+
|
||||
|
199
Documentation/admin-guide/thunderbolt.rst
Normal file
199
Documentation/admin-guide/thunderbolt.rst
Normal file
@ -0,0 +1,199 @@
|
||||
=============
|
||||
Thunderbolt
|
||||
=============
|
||||
The interface presented here is not meant for end users. Instead there
|
||||
should be a userspace tool that handles all the low-level details, keeps
|
||||
database of the authorized devices and prompts user for new connections.
|
||||
|
||||
More details about the sysfs interface for Thunderbolt devices can be
|
||||
found in ``Documentation/ABI/testing/sysfs-bus-thunderbolt``.
|
||||
|
||||
Those users who just want to connect any device without any sort of
|
||||
manual work, can add following line to
|
||||
``/etc/udev/rules.d/99-local.rules``::
|
||||
|
||||
ACTION=="add", SUBSYSTEM=="thunderbolt", ATTR{authorized}=="0", ATTR{authorized}="1"
|
||||
|
||||
This will authorize all devices automatically when they appear. However,
|
||||
keep in mind that this bypasses the security levels and makes the system
|
||||
vulnerable to DMA attacks.
|
||||
|
||||
Security levels and how to use them
|
||||
-----------------------------------
|
||||
Starting from Intel Falcon Ridge Thunderbolt controller there are 4
|
||||
security levels available. The reason for these is the fact that the
|
||||
connected devices can be DMA masters and thus read contents of the host
|
||||
memory without CPU and OS knowing about it. There are ways to prevent
|
||||
this by setting up an IOMMU but it is not always available for various
|
||||
reasons.
|
||||
|
||||
The security levels are as follows:
|
||||
|
||||
none
|
||||
All devices are automatically connected by the firmware. No user
|
||||
approval is needed. In BIOS settings this is typically called
|
||||
*Legacy mode*.
|
||||
|
||||
user
|
||||
User is asked whether the device is allowed to be connected.
|
||||
Based on the device identification information available through
|
||||
``/sys/bus/thunderbolt/devices``. user then can do the decision.
|
||||
In BIOS settings this is typically called *Unique ID*.
|
||||
|
||||
secure
|
||||
User is asked whether the device is allowed to be connected. In
|
||||
addition to UUID the device (if it supports secure connect) is sent
|
||||
a challenge that should match the expected one based on a random key
|
||||
written to ``key`` sysfs attribute. In BIOS settings this is
|
||||
typically called *One time saved key*.
|
||||
|
||||
dponly
|
||||
The firmware automatically creates tunnels for Display Port and
|
||||
USB. No PCIe tunneling is done. In BIOS settings this is
|
||||
typically called *Display Port Only*.
|
||||
|
||||
The current security level can be read from
|
||||
``/sys/bus/thunderbolt/devices/domainX/security`` where ``domainX`` is
|
||||
the Thunderbolt domain the host controller manages. There is typically
|
||||
one domain per Thunderbolt host controller.
|
||||
|
||||
If the security level reads as ``user`` or ``secure`` the connected
|
||||
device must be authorized by the user before PCIe tunnels are created
|
||||
(e.g the PCIe device appears).
|
||||
|
||||
Each Thunderbolt device plugged in will appear in sysfs under
|
||||
``/sys/bus/thunderbolt/devices``. The device directory carries
|
||||
information that can be used to identify the particular device,
|
||||
including its name and UUID.
|
||||
|
||||
Authorizing devices when security level is ``user`` or ``secure``
|
||||
-----------------------------------------------------------------
|
||||
When a device is plugged in it will appear in sysfs as follows::
|
||||
|
||||
/sys/bus/thunderbolt/devices/0-1/authorized - 0
|
||||
/sys/bus/thunderbolt/devices/0-1/device - 0x8004
|
||||
/sys/bus/thunderbolt/devices/0-1/device_name - Thunderbolt to FireWire Adapter
|
||||
/sys/bus/thunderbolt/devices/0-1/vendor - 0x1
|
||||
/sys/bus/thunderbolt/devices/0-1/vendor_name - Apple, Inc.
|
||||
/sys/bus/thunderbolt/devices/0-1/unique_id - e0376f00-0300-0100-ffff-ffffffffffff
|
||||
|
||||
The ``authorized`` attribute reads 0 which means no PCIe tunnels are
|
||||
created yet. The user can authorize the device by simply::
|
||||
|
||||
# echo 1 > /sys/bus/thunderbolt/devices/0-1/authorized
|
||||
|
||||
This will create the PCIe tunnels and the device is now connected.
|
||||
|
||||
If the device supports secure connect, and the domain security level is
|
||||
set to ``secure``, it has an additional attribute ``key`` which can hold
|
||||
a random 32 byte value used for authorization and challenging the device in
|
||||
future connects::
|
||||
|
||||
/sys/bus/thunderbolt/devices/0-3/authorized - 0
|
||||
/sys/bus/thunderbolt/devices/0-3/device - 0x305
|
||||
/sys/bus/thunderbolt/devices/0-3/device_name - AKiTiO Thunder3 PCIe Box
|
||||
/sys/bus/thunderbolt/devices/0-3/key -
|
||||
/sys/bus/thunderbolt/devices/0-3/vendor - 0x41
|
||||
/sys/bus/thunderbolt/devices/0-3/vendor_name - inXtron
|
||||
/sys/bus/thunderbolt/devices/0-3/unique_id - dc010000-0000-8508-a22d-32ca6421cb16
|
||||
|
||||
Notice the key is empty by default.
|
||||
|
||||
If the user does not want to use secure connect it can just ``echo 1``
|
||||
to the ``authorized`` attribute and the PCIe tunnels will be created in
|
||||
the same way than in ``user`` security level.
|
||||
|
||||
If the user wants to use secure connect, the first time the device is
|
||||
plugged a key needs to be created and send to the device::
|
||||
|
||||
# key=$(openssl rand -hex 32)
|
||||
# echo $key > /sys/bus/thunderbolt/devices/0-3/key
|
||||
# echo 1 > /sys/bus/thunderbolt/devices/0-3/authorized
|
||||
|
||||
Now the device is connected (PCIe tunnels are created) and in addition
|
||||
the key is stored on the device NVM.
|
||||
|
||||
Next time the device is plugged in the user can verify (challenge) the
|
||||
device using the same key::
|
||||
|
||||
# echo $key > /sys/bus/thunderbolt/devices/0-3/key
|
||||
# echo 2 > /sys/bus/thunderbolt/devices/0-3/authorized
|
||||
|
||||
If the challenge the device returns back matches the one we expect based
|
||||
on the key, the device is connected and the PCIe tunnels are created.
|
||||
However, if the challenge failed no tunnels are created and error is
|
||||
returned to the user.
|
||||
|
||||
If the user still wants to connect the device it can either approve
|
||||
the device without a key or write new key and write 1 to the
|
||||
``authorized`` file to get the new key stored on the device NVM.
|
||||
|
||||
Upgrading NVM on Thunderbolt device or host
|
||||
-------------------------------------------
|
||||
Since most of the functionality is handled in a firmware running on a
|
||||
host controller or a device, it is important that the firmware can be
|
||||
upgraded to the latest where possible bugs in it have been fixed.
|
||||
Typically OEMs provide this firmware from their support site.
|
||||
|
||||
There is also a central site which has links where to download firmwares
|
||||
for some machines:
|
||||
|
||||
`Thunderbolt Updates <https://thunderbolttechnology.net/updates>`_
|
||||
|
||||
Before you upgrade firmware on a device or host, please make sure it is
|
||||
the suitable. Failing to do that may render the device (or host) in a
|
||||
state where it cannot be used properly anymore without special tools!
|
||||
|
||||
Host NVM upgrade on Apple Macs is not supported.
|
||||
|
||||
Once the NVM image has been downloaded, you need to plug in a
|
||||
Thunderbolt device so that the host controller appears. It does not
|
||||
matter which device is connected (unless you are upgrading NVM on a
|
||||
device - then you need to connect that particular device).
|
||||
|
||||
Note OEM-specific method to power the controller up ("force power") may
|
||||
be available for your system in which case there is no need to plug in a
|
||||
Thunderbolt device.
|
||||
|
||||
After that we can write the firmware to the non-active parts of the NVM
|
||||
of the host or device. As an example here is how Intel NUC6i7KYK (Skull
|
||||
Canyon) Thunderbolt controller NVM is upgraded::
|
||||
|
||||
# dd if=KYK_TBT_FW_0018.bin of=/sys/bus/thunderbolt/devices/0-0/nvm_non_active0/nvmem
|
||||
|
||||
Once the operation completes we can trigger NVM authentication and
|
||||
upgrade process as follows::
|
||||
|
||||
# echo 1 > /sys/bus/thunderbolt/devices/0-0/nvm_authenticate
|
||||
|
||||
If no errors are returned, the host controller shortly disappears. Once
|
||||
it comes back the driver notices it and initiates a full power cycle.
|
||||
After a while the host controller appears again and this time it should
|
||||
be fully functional.
|
||||
|
||||
We can verify that the new NVM firmware is active by running following
|
||||
commands::
|
||||
|
||||
# cat /sys/bus/thunderbolt/devices/0-0/nvm_authenticate
|
||||
0x0
|
||||
# cat /sys/bus/thunderbolt/devices/0-0/nvm_version
|
||||
18.0
|
||||
|
||||
If ``nvm_authenticate`` contains anything else than 0x0 it is the error
|
||||
code from the last authentication cycle, which means the authentication
|
||||
of the NVM image failed.
|
||||
|
||||
Note names of the NVMem devices ``nvm_activeN`` and ``nvm_non_activeN``
|
||||
depends on the order they are registered in the NVMem subsystem. N in
|
||||
the name is the identifier added by the NVMem subsystem.
|
||||
|
||||
Upgrading NVM when host controller is in safe mode
|
||||
--------------------------------------------------
|
||||
If the existing NVM is not properly authenticated (or is missing) the
|
||||
host controller goes into safe mode which means that only available
|
||||
functionality is flashing new NVM image. When in this mode the reading
|
||||
``nvm_version`` fails with ``ENODATA`` and the device identification
|
||||
information is missing.
|
||||
|
||||
To recover from this mode, one needs to flash a valid NVM image to the
|
||||
host host controller in the same way it is done in the previous chapter.
|
@ -16,7 +16,7 @@ git branches/tags and email subject always contain this "at91" sub-string.
|
||||
|
||||
AT91 SoCs
|
||||
---------
|
||||
Documentation and detailled datasheet for each product are available on
|
||||
Documentation and detailed datasheet for each product are available on
|
||||
the Atmel website: http://www.atmel.com.
|
||||
|
||||
Flavors:
|
||||
@ -101,6 +101,42 @@ the Atmel website: http://www.atmel.com.
|
||||
+ Datasheet
|
||||
http://www.atmel.com/Images/Atmel-11267-32-bit-Cortex-A5-Microcontroller-SAMA5D2_Datasheet.pdf
|
||||
|
||||
* ARM Cortex-M7 MCUs
|
||||
- sams70 family
|
||||
- sams70j19
|
||||
- sams70j20
|
||||
- sams70j21
|
||||
- sams70n19
|
||||
- sams70n20
|
||||
- sams70n21
|
||||
- sams70q19
|
||||
- sams70q20
|
||||
- sams70q21
|
||||
+ Datasheet
|
||||
http://www.atmel.com/Images/Atmel-11242-32-bit-Cortex-M7-Microcontroller-SAM-S70Q-SAM-S70N-SAM-S70J_Datasheet.pdf
|
||||
|
||||
- samv70 family
|
||||
- samv70j19
|
||||
- samv70j20
|
||||
- samv70n19
|
||||
- samv70n20
|
||||
- samv70q19
|
||||
- samv70q20
|
||||
+ Datasheet
|
||||
http://www.atmel.com/Images/Atmel-11297-32-bit-Cortex-M7-Microcontroller-SAM-V70Q-SAM-V70N-SAM-V70J_Datasheet.pdf
|
||||
|
||||
- samv71 family
|
||||
- samv71j19
|
||||
- samv71j20
|
||||
- samv71j21
|
||||
- samv71n19
|
||||
- samv71n20
|
||||
- samv71n21
|
||||
- samv71q19
|
||||
- samv71q20
|
||||
- samv71q21
|
||||
+ Datasheet
|
||||
http://www.atmel.com/Images/Atmel-44003-32-bit-Cortex-M7-Microcontroller-SAM-V71Q-SAM-V71N-SAM-V71J_Datasheet.pdf
|
||||
|
||||
Linux kernel information
|
||||
------------------------
|
||||
|
@ -61,11 +61,15 @@ stable kernels.
|
||||
| Cavium | ThunderX ITS | #23144 | CAVIUM_ERRATUM_23144 |
|
||||
| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 |
|
||||
| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 |
|
||||
| Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 |
|
||||
| Cavium | ThunderX SMMUv2 | #27704 | N/A |
|
||||
| Cavium | ThunderX2 SMMUv3| #74 | N/A |
|
||||
| Cavium | ThunderX2 SMMUv3| #126 | N/A |
|
||||
| | | | |
|
||||
| Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 |
|
||||
| | | | |
|
||||
| Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 |
|
||||
| Hisilicon | Hip0{6,7} | #161010701 | N/A |
|
||||
| | | | |
|
||||
| Qualcomm Tech. | Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 |
|
||||
| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 |
|
||||
|
@ -1,10 +1,15 @@
|
||||
============================
|
||||
A block layer cache (bcache)
|
||||
============================
|
||||
|
||||
Say you've got a big slow raid 6, and an ssd or three. Wouldn't it be
|
||||
nice if you could use them as cache... Hence bcache.
|
||||
|
||||
Wiki and git repositories are at:
|
||||
http://bcache.evilpiepirate.org
|
||||
http://evilpiepirate.org/git/linux-bcache.git
|
||||
http://evilpiepirate.org/git/bcache-tools.git
|
||||
|
||||
- http://bcache.evilpiepirate.org
|
||||
- http://evilpiepirate.org/git/linux-bcache.git
|
||||
- http://evilpiepirate.org/git/bcache-tools.git
|
||||
|
||||
It's designed around the performance characteristics of SSDs - it only allocates
|
||||
in erase block sized buckets, and it uses a hybrid btree/log to track cached
|
||||
@ -37,17 +42,19 @@ to be flushed.
|
||||
|
||||
Getting started:
|
||||
You'll need make-bcache from the bcache-tools repository. Both the cache device
|
||||
and backing device must be formatted before use.
|
||||
and backing device must be formatted before use::
|
||||
|
||||
make-bcache -B /dev/sdb
|
||||
make-bcache -C /dev/sdc
|
||||
|
||||
make-bcache has the ability to format multiple devices at the same time - if
|
||||
you format your backing devices and cache device at the same time, you won't
|
||||
have to manually attach:
|
||||
have to manually attach::
|
||||
|
||||
make-bcache -B /dev/sda /dev/sdb -C /dev/sdc
|
||||
|
||||
bcache-tools now ships udev rules, and bcache devices are known to the kernel
|
||||
immediately. Without udev, you can manually register devices like this:
|
||||
immediately. Without udev, you can manually register devices like this::
|
||||
|
||||
echo /dev/sdb > /sys/fs/bcache/register
|
||||
echo /dev/sdc > /sys/fs/bcache/register
|
||||
@ -60,16 +67,16 @@ slow devices as bcache backing devices without a cache, and you can choose to ad
|
||||
a caching device later.
|
||||
See 'ATTACHING' section below.
|
||||
|
||||
The devices show up as:
|
||||
The devices show up as::
|
||||
|
||||
/dev/bcache<N>
|
||||
|
||||
As well as (with udev):
|
||||
As well as (with udev)::
|
||||
|
||||
/dev/bcache/by-uuid/<uuid>
|
||||
/dev/bcache/by-label/<label>
|
||||
|
||||
To get started:
|
||||
To get started::
|
||||
|
||||
mkfs.ext4 /dev/bcache0
|
||||
mount /dev/bcache0 /mnt
|
||||
@ -81,13 +88,13 @@ Cache devices are managed as sets; multiple caches per set isn't supported yet
|
||||
but will allow for mirroring of metadata and dirty data in the future. Your new
|
||||
cache set shows up as /sys/fs/bcache/<UUID>
|
||||
|
||||
ATTACHING
|
||||
Attaching
|
||||
---------
|
||||
|
||||
After your cache device and backing device are registered, the backing device
|
||||
must be attached to your cache set to enable caching. Attaching a backing
|
||||
device to a cache set is done thusly, with the UUID of the cache set in
|
||||
/sys/fs/bcache:
|
||||
/sys/fs/bcache::
|
||||
|
||||
echo <CSET-UUID> > /sys/block/bcache0/bcache/attach
|
||||
|
||||
@ -97,7 +104,7 @@ your bcache devices. If a backing device has data in a cache somewhere, the
|
||||
important if you have writeback caching turned on.
|
||||
|
||||
If you're booting up and your cache device is gone and never coming back, you
|
||||
can force run the backing device:
|
||||
can force run the backing device::
|
||||
|
||||
echo 1 > /sys/block/sdb/bcache/running
|
||||
|
||||
@ -110,7 +117,7 @@ but all the cached data will be invalidated. If there was dirty data in the
|
||||
cache, don't expect the filesystem to be recoverable - you will have massive
|
||||
filesystem corruption, though ext4's fsck does work miracles.
|
||||
|
||||
ERROR HANDLING
|
||||
Error Handling
|
||||
--------------
|
||||
|
||||
Bcache tries to transparently handle IO errors to/from the cache device without
|
||||
@ -134,25 +141,27 @@ the backing devices to passthrough mode.
|
||||
read some of the dirty data, though.
|
||||
|
||||
|
||||
HOWTO/COOKBOOK
|
||||
Howto/cookbook
|
||||
--------------
|
||||
|
||||
A) Starting a bcache with a missing caching device
|
||||
|
||||
If registering the backing device doesn't help, it's already there, you just need
|
||||
to force it to run without the cache:
|
||||
to force it to run without the cache::
|
||||
|
||||
host:~# echo /dev/sdb1 > /sys/fs/bcache/register
|
||||
[ 119.844831] bcache: register_bcache() error opening /dev/sdb1: device already registered
|
||||
|
||||
Next, you try to register your caching device if it's present. However
|
||||
if it's absent, or registration fails for some reason, you can still
|
||||
start your bcache without its cache, like so:
|
||||
start your bcache without its cache, like so::
|
||||
|
||||
host:/sys/block/sdb/sdb1/bcache# echo 1 > running
|
||||
|
||||
Note that this may cause data loss if you were running in writeback mode.
|
||||
|
||||
|
||||
B) Bcache does not find its cache
|
||||
B) Bcache does not find its cache::
|
||||
|
||||
host:/sys/block/md5/bcache# echo 0226553a-37cf-41d5-b3ce-8b1e944543a8 > attach
|
||||
[ 1933.455082] bcache: bch_cached_dev_attach() Couldn't find uuid for md5 in set
|
||||
@ -160,7 +169,8 @@ B) Bcache does not find its cache
|
||||
[ 1933.478179] : cache set not found
|
||||
|
||||
In this case, the caching device was simply not registered at boot
|
||||
or disappeared and came back, and needs to be (re-)registered:
|
||||
or disappeared and came back, and needs to be (re-)registered::
|
||||
|
||||
host:/sys/block/md5/bcache# echo /dev/sdh2 > /sys/fs/bcache/register
|
||||
|
||||
|
||||
@ -180,7 +190,8 @@ device is still available at an 8KiB offset. So either via a loopdev
|
||||
of the backing device created with --offset 8K, or any value defined by
|
||||
--data-offset when you originally formatted bcache with `make-bcache`.
|
||||
|
||||
For example:
|
||||
For example::
|
||||
|
||||
losetup -o 8192 /dev/loop0 /dev/your_bcache_backing_dev
|
||||
|
||||
This should present your unmodified backing device data in /dev/loop0
|
||||
@ -191,33 +202,38 @@ cache device without loosing data.
|
||||
|
||||
E) Wiping a cache device
|
||||
|
||||
host:~# wipefs -a /dev/sdh2
|
||||
16 bytes were erased at offset 0x1018 (bcache)
|
||||
they were: c6 85 73 f6 4e 1a 45 ca 82 65 f5 7f 48 ba 6d 81
|
||||
::
|
||||
|
||||
After you boot back with bcache enabled, you recreate the cache and attach it:
|
||||
host:~# make-bcache -C /dev/sdh2
|
||||
UUID: 7be7e175-8f4c-4f99-94b2-9c904d227045
|
||||
Set UUID: 5bc072a8-ab17-446d-9744-e247949913c1
|
||||
version: 0
|
||||
nbuckets: 106874
|
||||
block_size: 1
|
||||
bucket_size: 1024
|
||||
nr_in_set: 1
|
||||
nr_this_dev: 0
|
||||
first_bucket: 1
|
||||
[ 650.511912] bcache: run_cache_set() invalidating existing data
|
||||
[ 650.549228] bcache: register_cache() registered cache device sdh2
|
||||
host:~# wipefs -a /dev/sdh2
|
||||
16 bytes were erased at offset 0x1018 (bcache)
|
||||
they were: c6 85 73 f6 4e 1a 45 ca 82 65 f5 7f 48 ba 6d 81
|
||||
|
||||
start backing device with missing cache:
|
||||
host:/sys/block/md5/bcache# echo 1 > running
|
||||
After you boot back with bcache enabled, you recreate the cache and attach it::
|
||||
|
||||
attach new cache:
|
||||
host:/sys/block/md5/bcache# echo 5bc072a8-ab17-446d-9744-e247949913c1 > attach
|
||||
[ 865.276616] bcache: bch_cached_dev_attach() Caching md5 as bcache0 on set 5bc072a8-ab17-446d-9744-e247949913c1
|
||||
host:~# make-bcache -C /dev/sdh2
|
||||
UUID: 7be7e175-8f4c-4f99-94b2-9c904d227045
|
||||
Set UUID: 5bc072a8-ab17-446d-9744-e247949913c1
|
||||
version: 0
|
||||
nbuckets: 106874
|
||||
block_size: 1
|
||||
bucket_size: 1024
|
||||
nr_in_set: 1
|
||||
nr_this_dev: 0
|
||||
first_bucket: 1
|
||||
[ 650.511912] bcache: run_cache_set() invalidating existing data
|
||||
[ 650.549228] bcache: register_cache() registered cache device sdh2
|
||||
|
||||
start backing device with missing cache::
|
||||
|
||||
host:/sys/block/md5/bcache# echo 1 > running
|
||||
|
||||
attach new cache::
|
||||
|
||||
host:/sys/block/md5/bcache# echo 5bc072a8-ab17-446d-9744-e247949913c1 > attach
|
||||
[ 865.276616] bcache: bch_cached_dev_attach() Caching md5 as bcache0 on set 5bc072a8-ab17-446d-9744-e247949913c1
|
||||
|
||||
|
||||
F) Remove or replace a caching device
|
||||
F) Remove or replace a caching device::
|
||||
|
||||
host:/sys/block/sda/sda7/bcache# echo 1 > detach
|
||||
[ 695.872542] bcache: cached_dev_detach_finish() Caching disabled for sda7
|
||||
@ -226,13 +242,15 @@ F) Remove or replace a caching device
|
||||
wipefs: error: /dev/nvme0n1p4: probing initialization failed: Device or resource busy
|
||||
Ooops, it's disabled, but not unregistered, so it's still protected
|
||||
|
||||
We need to go and unregister it:
|
||||
We need to go and unregister it::
|
||||
|
||||
host:/sys/fs/bcache/b7ba27a1-2398-4649-8ae3-0959f57ba128# ls -l cache0
|
||||
lrwxrwxrwx 1 root root 0 Feb 25 18:33 cache0 -> ../../../devices/pci0000:00/0000:00:1d.0/0000:70:00.0/nvme/nvme0/nvme0n1/nvme0n1p4/bcache/
|
||||
host:/sys/fs/bcache/b7ba27a1-2398-4649-8ae3-0959f57ba128# echo 1 > stop
|
||||
kernel: [ 917.041908] bcache: cache_set_free() Cache set b7ba27a1-2398-4649-8ae3-0959f57ba128 unregistered
|
||||
|
||||
Now we can wipe it:
|
||||
Now we can wipe it::
|
||||
|
||||
host:~# wipefs -a /dev/nvme0n1p4
|
||||
/dev/nvme0n1p4: 16 bytes were erased at offset 0x00001018 (bcache): c6 85 73 f6 4e 1a 45 ca 82 65 f5 7f 48 ba 6d 81
|
||||
|
||||
@ -252,40 +270,44 @@ if there are any active backing or caching devices left on it:
|
||||
|
||||
1) Is it present in /dev/bcache* ? (there are times where it won't be)
|
||||
|
||||
If so, it's easy:
|
||||
If so, it's easy::
|
||||
|
||||
host:/sys/block/bcache0/bcache# echo 1 > stop
|
||||
|
||||
2) But if your backing device is gone, this won't work:
|
||||
2) But if your backing device is gone, this won't work::
|
||||
|
||||
host:/sys/block/bcache0# cd bcache
|
||||
bash: cd: bcache: No such file or directory
|
||||
|
||||
In this case, you may have to unregister the dmcrypt block device that
|
||||
references this bcache to free it up:
|
||||
In this case, you may have to unregister the dmcrypt block device that
|
||||
references this bcache to free it up::
|
||||
|
||||
host:~# dmsetup remove oldds1
|
||||
bcache: bcache_device_free() bcache0 stopped
|
||||
bcache: cache_set_free() Cache set 5bc072a8-ab17-446d-9744-e247949913c1 unregistered
|
||||
|
||||
This causes the backing bcache to be removed from /sys/fs/bcache and
|
||||
then it can be reused. This would be true of any block device stacking
|
||||
where bcache is a lower device.
|
||||
This causes the backing bcache to be removed from /sys/fs/bcache and
|
||||
then it can be reused. This would be true of any block device stacking
|
||||
where bcache is a lower device.
|
||||
|
||||
3) In other cases, you can also look in /sys/fs/bcache/:
|
||||
3) In other cases, you can also look in /sys/fs/bcache/::
|
||||
|
||||
host:/sys/fs/bcache# ls -l */{cache?,bdev?}
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/bdev1 -> ../../../devices/virtual/block/dm-1/bcache/
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/cache0 -> ../../../devices/virtual/block/dm-4/bcache/
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 5bc072a8-ab17-446d-9744-e247949913c1/cache0 -> ../../../devices/pci0000:00/0000:00:01.0/0000:01:00.0/ata10/host9/target9:0:0/9:0:0:0/block/sdl/sdl2/bcache/
|
||||
host:/sys/fs/bcache# ls -l */{cache?,bdev?}
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/bdev1 -> ../../../devices/virtual/block/dm-1/bcache/
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/cache0 -> ../../../devices/virtual/block/dm-4/bcache/
|
||||
lrwxrwxrwx 1 root root 0 Mar 5 09:39 5bc072a8-ab17-446d-9744-e247949913c1/cache0 -> ../../../devices/pci0000:00/0000:00:01.0/0000:01:00.0/ata10/host9/target9:0:0/9:0:0:0/block/sdl/sdl2/bcache/
|
||||
|
||||
The device names will show which UUID is relevant, cd in that directory
|
||||
and stop the cache::
|
||||
|
||||
The device names will show which UUID is relevant, cd in that directory
|
||||
and stop the cache:
|
||||
host:/sys/fs/bcache/5bc072a8-ab17-446d-9744-e247949913c1# echo 1 > stop
|
||||
|
||||
This will free up bcache references and let you reuse the partition for
|
||||
other purposes.
|
||||
This will free up bcache references and let you reuse the partition for
|
||||
other purposes.
|
||||
|
||||
|
||||
|
||||
TROUBLESHOOTING PERFORMANCE
|
||||
Troubleshooting performance
|
||||
---------------------------
|
||||
|
||||
Bcache has a bunch of config options and tunables. The defaults are intended to
|
||||
@ -301,11 +323,13 @@ want for getting the best possible numbers when benchmarking.
|
||||
raid stripe size to get the disk multiples that you would like.
|
||||
|
||||
For example: If you have a 64k stripe size, then the following offset
|
||||
would provide alignment for many common RAID5 data spindle counts:
|
||||
would provide alignment for many common RAID5 data spindle counts::
|
||||
|
||||
64k * 2*2*2*3*3*5*7 bytes = 161280k
|
||||
|
||||
That space is wasted, but for only 157.5MB you can grow your RAID 5
|
||||
volume to the following data-spindle counts without re-aligning:
|
||||
volume to the following data-spindle counts without re-aligning::
|
||||
|
||||
3,4,5,6,7,8,9,10,12,14,15,18,20,21 ...
|
||||
|
||||
- Bad write performance
|
||||
@ -313,9 +337,9 @@ want for getting the best possible numbers when benchmarking.
|
||||
If write performance is not what you expected, you probably wanted to be
|
||||
running in writeback mode, which isn't the default (not due to a lack of
|
||||
maturity, but simply because in writeback mode you'll lose data if something
|
||||
happens to your SSD)
|
||||
happens to your SSD)::
|
||||
|
||||
# echo writeback > /sys/block/bcache0/bcache/cache_mode
|
||||
# echo writeback > /sys/block/bcache0/bcache/cache_mode
|
||||
|
||||
- Bad performance, or traffic not going to the SSD that you'd expect
|
||||
|
||||
@ -325,13 +349,13 @@ want for getting the best possible numbers when benchmarking.
|
||||
accessed data out of your cache.
|
||||
|
||||
But if you want to benchmark reads from cache, and you start out with fio
|
||||
writing an 8 gigabyte test file - so you want to disable that.
|
||||
writing an 8 gigabyte test file - so you want to disable that::
|
||||
|
||||
# echo 0 > /sys/block/bcache0/bcache/sequential_cutoff
|
||||
# echo 0 > /sys/block/bcache0/bcache/sequential_cutoff
|
||||
|
||||
To set it back to the default (4 mb), do
|
||||
To set it back to the default (4 mb), do::
|
||||
|
||||
# echo 4M > /sys/block/bcache0/bcache/sequential_cutoff
|
||||
# echo 4M > /sys/block/bcache0/bcache/sequential_cutoff
|
||||
|
||||
- Traffic's still going to the spindle/still getting cache misses
|
||||
|
||||
@ -344,10 +368,10 @@ want for getting the best possible numbers when benchmarking.
|
||||
throttles traffic if the latency exceeds a threshold (it does this by
|
||||
cranking down the sequential bypass).
|
||||
|
||||
You can disable this if you need to by setting the thresholds to 0:
|
||||
You can disable this if you need to by setting the thresholds to 0::
|
||||
|
||||
# echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us
|
||||
# echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us
|
||||
# echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us
|
||||
# echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us
|
||||
|
||||
The default is 2000 us (2 milliseconds) for reads, and 20000 for writes.
|
||||
|
||||
@ -369,7 +393,7 @@ want for getting the best possible numbers when benchmarking.
|
||||
a fix for the issue there).
|
||||
|
||||
|
||||
SYSFS - BACKING DEVICE
|
||||
Sysfs - backing device
|
||||
----------------------
|
||||
|
||||
Available at /sys/block/<bdev>/bcache, /sys/block/bcache*/bcache and
|
||||
@ -454,7 +478,8 @@ writeback_running
|
||||
still be added to the cache until it is mostly full; only meant for
|
||||
benchmarking. Defaults to on.
|
||||
|
||||
SYSFS - BACKING DEVICE STATS:
|
||||
Sysfs - backing device stats
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
There are directories with these numbers for a running total, as well as
|
||||
versions that decay over the past day, hour and 5 minutes; they're also
|
||||
@ -463,14 +488,11 @@ aggregated in the cache set directory as well.
|
||||
bypassed
|
||||
Amount of IO (both reads and writes) that has bypassed the cache
|
||||
|
||||
cache_hits
|
||||
cache_misses
|
||||
cache_hit_ratio
|
||||
cache_hits, cache_misses, cache_hit_ratio
|
||||
Hits and misses are counted per individual IO as bcache sees them; a
|
||||
partial hit is counted as a miss.
|
||||
|
||||
cache_bypass_hits
|
||||
cache_bypass_misses
|
||||
cache_bypass_hits, cache_bypass_misses
|
||||
Hits and misses for IO that is intended to skip the cache are still counted,
|
||||
but broken out here.
|
||||
|
||||
@ -482,7 +504,8 @@ cache_miss_collisions
|
||||
cache_readaheads
|
||||
Count of times readahead occurred.
|
||||
|
||||
SYSFS - CACHE SET:
|
||||
Sysfs - cache set
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
Available at /sys/fs/bcache/<cset-uuid>
|
||||
|
||||
@ -520,8 +543,7 @@ flash_vol_create
|
||||
Echoing a size to this file (in human readable units, k/M/G) creates a thinly
|
||||
provisioned volume backed by the cache set.
|
||||
|
||||
io_error_halflife
|
||||
io_error_limit
|
||||
io_error_halflife, io_error_limit
|
||||
These determines how many errors we accept before disabling the cache.
|
||||
Each error is decayed by the half life (in # ios). If the decaying count
|
||||
reaches io_error_limit dirty data is written out and the cache is disabled.
|
||||
@ -545,7 +567,8 @@ unregister
|
||||
Detaches all backing devices and closes the cache devices; if dirty data is
|
||||
present it will disable writeback caching and wait for it to be flushed.
|
||||
|
||||
SYSFS - CACHE SET INTERNAL:
|
||||
Sysfs - cache set internal
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This directory also exposes timings for a number of internal operations, with
|
||||
separate files for average duration, average frequency, last occurrence and max
|
||||
@ -574,7 +597,8 @@ cache_read_races
|
||||
trigger_gc
|
||||
Writing to this file forces garbage collection to run.
|
||||
|
||||
SYSFS - CACHE DEVICE:
|
||||
Sysfs - Cache device
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Available at /sys/block/<cdev>/bcache
|
||||
|
||||
|
@ -632,7 +632,7 @@ to i/o submission, if the bio fields are likely to be accessed after the
|
||||
i/o is issued (since the bio may otherwise get freed in case i/o completion
|
||||
happens in the meantime).
|
||||
|
||||
The bio_clone() routine may be used to duplicate a bio, where the clone
|
||||
The bio_clone_fast() routine may be used to duplicate a bio, where the clone
|
||||
shares the bio_vec_list with the original bio (i.e. both point to the
|
||||
same bio_vec_list). This would typically be used for splitting i/o requests
|
||||
in lvm or md.
|
||||
|
@ -192,7 +192,7 @@ will require extra work due to the application tag.
|
||||
supported by the block device.
|
||||
|
||||
|
||||
int bio_integrity_prep(bio);
|
||||
bool bio_integrity_prep(bio);
|
||||
|
||||
To generate IMD for WRITE and to set up buffers for READ, the
|
||||
filesystem must call bio_integrity_prep(bio).
|
||||
@ -201,9 +201,7 @@ will require extra work due to the application tag.
|
||||
sector must be set, and the bio should have all data pages
|
||||
added. It is up to the caller to ensure that the bio does not
|
||||
change while I/O is in progress.
|
||||
|
||||
bio_integrity_prep() should only be called if
|
||||
bio_integrity_enabled() returned 1.
|
||||
Complete bio with error if prepare failed for some reson.
|
||||
|
||||
|
||||
5.3 PASSING EXISTING INTEGRITY METADATA
|
||||
|
@ -1,12 +1,8 @@
|
||||
===============================================================
|
||||
== BT8XXGPIO driver ==
|
||||
== ==
|
||||
== A driver for a selfmade cheap BT8xx based PCI GPIO-card ==
|
||||
== ==
|
||||
== For advanced documentation, see ==
|
||||
== http://www.bu3sch.de/btgpio.php ==
|
||||
===============================================================
|
||||
===================================================================
|
||||
A driver for a selfmade cheap BT8xx based PCI GPIO-card (bt8xxgpio)
|
||||
===================================================================
|
||||
|
||||
For advanced documentation, see http://www.bu3sch.de/btgpio.php
|
||||
|
||||
A generic digital 24-port PCI GPIO card can be built out of an ordinary
|
||||
Brooktree bt848, bt849, bt878 or bt879 based analog TV tuner card. The
|
||||
@ -17,9 +13,8 @@ The bt8xx chip does have 24 digital GPIO ports.
|
||||
These ports are accessible via 24 pins on the SMD chip package.
|
||||
|
||||
|
||||
==============================================
|
||||
== How to physically access the GPIO pins ==
|
||||
==============================================
|
||||
How to physically access the GPIO pins
|
||||
======================================
|
||||
|
||||
The are several ways to access these pins. One might unsolder the whole chip
|
||||
and put it on a custom PCI board, or one might only unsolder each individual
|
||||
@ -27,7 +22,7 @@ GPIO pin and solder that to some tiny wire. As the chip package really is tiny
|
||||
there are some advanced soldering skills needed in any case.
|
||||
|
||||
The physical pinouts are drawn in the following ASCII art.
|
||||
The GPIO pins are marked with G00-G23
|
||||
The GPIO pins are marked with G00-G23::
|
||||
|
||||
G G G G G G G G G G G G G G G G G G
|
||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
|
||||
|
@ -1,18 +1,16 @@
|
||||
=======================================================================
|
||||
README for btmrvl driver
|
||||
=======================================================================
|
||||
|
||||
=============
|
||||
btmrvl driver
|
||||
=============
|
||||
|
||||
All commands are used via debugfs interface.
|
||||
|
||||
=====================
|
||||
Set/get driver configurations:
|
||||
Set/get driver configurations
|
||||
=============================
|
||||
|
||||
Path: /debug/btmrvl/config/
|
||||
|
||||
gpiogap=[n]
|
||||
hscfgcmd
|
||||
These commands are used to configure the host sleep parameters.
|
||||
gpiogap=[n], hscfgcmd
|
||||
These commands are used to configure the host sleep parameters::
|
||||
bit 8:0 -- Gap
|
||||
bit 16:8 -- GPIO
|
||||
|
||||
@ -23,7 +21,8 @@ hscfgcmd
|
||||
where Gap is the gap in milli seconds between wakeup signal and
|
||||
wakeup event, or 0xff for special host sleep setting.
|
||||
|
||||
Usage:
|
||||
Usage::
|
||||
|
||||
# Use SDIO interface to wake up the host and set GAP to 0x80:
|
||||
echo 0xff80 > /debug/btmrvl/config/gpiogap
|
||||
echo 1 > /debug/btmrvl/config/hscfgcmd
|
||||
@ -32,15 +31,16 @@ hscfgcmd
|
||||
echo 0x03ff > /debug/btmrvl/config/gpiogap
|
||||
echo 1 > /debug/btmrvl/config/hscfgcmd
|
||||
|
||||
psmode=[n]
|
||||
pscmd
|
||||
psmode=[n], pscmd
|
||||
These commands are used to enable/disable auto sleep mode
|
||||
|
||||
where the option is:
|
||||
where the option is::
|
||||
|
||||
1 -- Enable auto sleep mode
|
||||
0 -- Disable auto sleep mode
|
||||
|
||||
Usage:
|
||||
Usage::
|
||||
|
||||
# Enable auto sleep mode
|
||||
echo 1 > /debug/btmrvl/config/psmode
|
||||
echo 1 > /debug/btmrvl/config/pscmd
|
||||
@ -50,15 +50,16 @@ pscmd
|
||||
echo 1 > /debug/btmrvl/config/pscmd
|
||||
|
||||
|
||||
hsmode=[n]
|
||||
hscmd
|
||||
hsmode=[n], hscmd
|
||||
These commands are used to enable host sleep or wake up firmware
|
||||
|
||||
where the option is:
|
||||
where the option is::
|
||||
|
||||
1 -- Enable host sleep
|
||||
0 -- Wake up firmware
|
||||
|
||||
Usage:
|
||||
Usage::
|
||||
|
||||
# Enable host sleep
|
||||
echo 1 > /debug/btmrvl/config/hsmode
|
||||
echo 1 > /debug/btmrvl/config/hscmd
|
||||
@ -68,12 +69,13 @@ hscmd
|
||||
echo 1 > /debug/btmrvl/config/hscmd
|
||||
|
||||
|
||||
======================
|
||||
Get driver status:
|
||||
Get driver status
|
||||
=================
|
||||
|
||||
Path: /debug/btmrvl/status/
|
||||
|
||||
Usage:
|
||||
Usage::
|
||||
|
||||
cat /debug/btmrvl/status/<args>
|
||||
|
||||
where the args are:
|
||||
@ -90,14 +92,17 @@ hsstate
|
||||
txdnldrdy
|
||||
This command displays the value of Tx download ready flag.
|
||||
|
||||
|
||||
=====================
|
||||
Issuing a raw hci command
|
||||
=========================
|
||||
|
||||
Use hcitool to issue raw hci command, refer to hcitool manual
|
||||
|
||||
Usage: Hcitool cmd <ogf> <ocf> [Parameters]
|
||||
Usage::
|
||||
|
||||
Hcitool cmd <ogf> <ocf> [Parameters]
|
||||
|
||||
Interface Control Command::
|
||||
|
||||
Interface Control Command
|
||||
hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00 --Enable All interface
|
||||
hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01 --Enable Wlan interface
|
||||
hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02 --Enable BT interface
|
||||
@ -105,13 +110,13 @@ Use hcitool to issue raw hci command, refer to hcitool manual
|
||||
hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01 --Disable Wlan interface
|
||||
hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02 --Disable BT interface
|
||||
|
||||
=======================================================================
|
||||
SD8688 firmware
|
||||
===============
|
||||
|
||||
Images:
|
||||
|
||||
SD8688 firmware:
|
||||
|
||||
/lib/firmware/sd8688_helper.bin
|
||||
/lib/firmware/sd8688.bin
|
||||
- /lib/firmware/sd8688_helper.bin
|
||||
- /lib/firmware/sd8688.bin
|
||||
|
||||
|
||||
The images can be downloaded from:
|
||||
|
@ -1,17 +1,27 @@
|
||||
[ NOTE: The virt_to_bus() and bus_to_virt() functions have been
|
||||
==========================================================
|
||||
How to access I/O mapped memory from within device drivers
|
||||
==========================================================
|
||||
|
||||
:Author: Linus
|
||||
|
||||
.. warning::
|
||||
|
||||
The virt_to_bus() and bus_to_virt() functions have been
|
||||
superseded by the functionality provided by the PCI DMA interface
|
||||
(see Documentation/DMA-API-HOWTO.txt). They continue
|
||||
to be documented below for historical purposes, but new code
|
||||
must not use them. --davidm 00/12/12 ]
|
||||
must not use them. --davidm 00/12/12
|
||||
|
||||
[ This is a mail message in response to a query on IO mapping, thus the
|
||||
strange format for a "document" ]
|
||||
::
|
||||
|
||||
[ This is a mail message in response to a query on IO mapping, thus the
|
||||
strange format for a "document" ]
|
||||
|
||||
The AHA-1542 is a bus-master device, and your patch makes the driver give the
|
||||
controller the physical address of the buffers, which is correct on x86
|
||||
(because all bus master devices see the physical memory mappings directly).
|
||||
|
||||
However, on many setups, there are actually _three_ different ways of looking
|
||||
However, on many setups, there are actually **three** different ways of looking
|
||||
at memory addresses, and in this case we actually want the third, the
|
||||
so-called "bus address".
|
||||
|
||||
@ -38,7 +48,7 @@ because the memory and the devices share the same address space, and that is
|
||||
not generally necessarily true on other PCI/ISA setups.
|
||||
|
||||
Now, just as an example, on the PReP (PowerPC Reference Platform), the
|
||||
CPU sees a memory map something like this (this is from memory):
|
||||
CPU sees a memory map something like this (this is from memory)::
|
||||
|
||||
0-2 GB "real memory"
|
||||
2 GB-3 GB "system IO" (inb/out and similar accesses on x86)
|
||||
@ -52,7 +62,7 @@ So when the CPU wants any bus master to write to physical memory 0, it
|
||||
has to give the master address 0x80000000 as the memory address.
|
||||
|
||||
So, for example, depending on how the kernel is actually mapped on the
|
||||
PPC, you can end up with a setup like this:
|
||||
PPC, you can end up with a setup like this::
|
||||
|
||||
physical address: 0
|
||||
virtual address: 0xC0000000
|
||||
@ -61,7 +71,7 @@ PPC, you can end up with a setup like this:
|
||||
where all the addresses actually point to the same thing. It's just seen
|
||||
through different translations..
|
||||
|
||||
Similarly, on the Alpha, the normal translation is
|
||||
Similarly, on the Alpha, the normal translation is::
|
||||
|
||||
physical address: 0
|
||||
virtual address: 0xfffffc0000000000
|
||||
@ -70,7 +80,7 @@ Similarly, on the Alpha, the normal translation is
|
||||
(but there are also Alphas where the physical address and the bus address
|
||||
are the same).
|
||||
|
||||
Anyway, the way to look up all these translations, you do
|
||||
Anyway, the way to look up all these translations, you do::
|
||||
|
||||
#include <asm/io.h>
|
||||
|
||||
@ -81,8 +91,8 @@ Anyway, the way to look up all these translations, you do
|
||||
|
||||
Now, when do you need these?
|
||||
|
||||
You want the _virtual_ address when you are actually going to access that
|
||||
pointer from the kernel. So you can have something like this:
|
||||
You want the **virtual** address when you are actually going to access that
|
||||
pointer from the kernel. So you can have something like this::
|
||||
|
||||
/*
|
||||
* this is the hardware "mailbox" we use to communicate with
|
||||
@ -104,7 +114,7 @@ pointer from the kernel. So you can have something like this:
|
||||
...
|
||||
|
||||
on the other hand, you want the bus address when you have a buffer that
|
||||
you want to give to the controller:
|
||||
you want to give to the controller::
|
||||
|
||||
/* ask the controller to read the sense status into "sense_buffer" */
|
||||
mbox.bufstart = virt_to_bus(&sense_buffer);
|
||||
@ -112,7 +122,7 @@ you want to give to the controller:
|
||||
mbox.status = 0;
|
||||
notify_controller(&mbox);
|
||||
|
||||
And you generally _never_ want to use the physical address, because you can't
|
||||
And you generally **never** want to use the physical address, because you can't
|
||||
use that from the CPU (the CPU only uses translated virtual addresses), and
|
||||
you can't use it from the bus master.
|
||||
|
||||
@ -124,8 +134,10 @@ be remapped as measured in units of pages, a.k.a. the pfn (the memory
|
||||
management layer doesn't know about devices outside the CPU, so it
|
||||
shouldn't need to know about "bus addresses" etc).
|
||||
|
||||
NOTE NOTE NOTE! The above is only one part of the whole equation. The above
|
||||
only talks about "real memory", that is, CPU memory (RAM).
|
||||
.. note::
|
||||
|
||||
The above is only one part of the whole equation. The above
|
||||
only talks about "real memory", that is, CPU memory (RAM).
|
||||
|
||||
There is a completely different type of memory too, and that's the "shared
|
||||
memory" on the PCI or ISA bus. That's generally not RAM (although in the case
|
||||
@ -137,20 +149,22 @@ whatever, and there is only one way to access it: the readb/writeb and
|
||||
related functions. You should never take the address of such memory, because
|
||||
there is really nothing you can do with such an address: it's not
|
||||
conceptually in the same memory space as "real memory" at all, so you cannot
|
||||
just dereference a pointer. (Sadly, on x86 it _is_ in the same memory space,
|
||||
just dereference a pointer. (Sadly, on x86 it **is** in the same memory space,
|
||||
so on x86 it actually works to just deference a pointer, but it's not
|
||||
portable).
|
||||
|
||||
For such memory, you can do things like
|
||||
For such memory, you can do things like:
|
||||
|
||||
- reading::
|
||||
|
||||
- reading:
|
||||
/*
|
||||
* read first 32 bits from ISA memory at 0xC0000, aka
|
||||
* C000:0000 in DOS terms
|
||||
*/
|
||||
unsigned int signature = isa_readl(0xC0000);
|
||||
|
||||
- remapping and writing:
|
||||
- remapping and writing::
|
||||
|
||||
/*
|
||||
* remap framebuffer PCI memory area at 0xFC000000,
|
||||
* size 1MB, so that we can access it: We can directly
|
||||
@ -165,7 +179,8 @@ For such memory, you can do things like
|
||||
/* unmap when we unload the driver */
|
||||
iounmap(baseptr);
|
||||
|
||||
- copying and clearing:
|
||||
- copying and clearing::
|
||||
|
||||
/* get the 6-byte Ethernet address at ISA address E000:0040 */
|
||||
memcpy_fromio(kernel_buffer, 0xE0040, 6);
|
||||
/* write a packet to the driver */
|
||||
@ -181,10 +196,10 @@ happy that your driver works ;)
|
||||
Note that kernel versions 2.0.x (and earlier) mistakenly called the
|
||||
ioremap() function "vremap()". ioremap() is the proper name, but I
|
||||
didn't think straight when I wrote it originally. People who have to
|
||||
support both can do something like:
|
||||
support both can do something like::
|
||||
|
||||
/* support old naming silliness */
|
||||
#if LINUX_VERSION_CODE < 0x020100
|
||||
#if LINUX_VERSION_CODE < 0x020100
|
||||
#define ioremap vremap
|
||||
#define iounmap vfree
|
||||
#endif
|
||||
@ -196,13 +211,10 @@ And the above sounds worse than it really is. Most real drivers really
|
||||
don't do all that complex things (or rather: the complexity is not so
|
||||
much in the actual IO accesses as in error handling and timeouts etc).
|
||||
It's generally not hard to fix drivers, and in many cases the code
|
||||
actually looks better afterwards:
|
||||
actually looks better afterwards::
|
||||
|
||||
unsigned long signature = *(unsigned int *) 0xC0000;
|
||||
vs
|
||||
unsigned long signature = readl(0xC0000);
|
||||
|
||||
I think the second version actually is more readable, no?
|
||||
|
||||
Linus
|
||||
|
||||
|
@ -1,7 +1,8 @@
|
||||
Cache and TLB Flushing
|
||||
Under Linux
|
||||
==================================
|
||||
Cache and TLB Flushing Under Linux
|
||||
==================================
|
||||
|
||||
David S. Miller <davem@redhat.com>
|
||||
:Author: David S. Miller <davem@redhat.com>
|
||||
|
||||
This document describes the cache/tlb flushing interfaces called
|
||||
by the Linux VM subsystem. It enumerates over each interface,
|
||||
@ -28,7 +29,7 @@ Therefore when software page table changes occur, the kernel will
|
||||
invoke one of the following flush methods _after_ the page table
|
||||
changes occur:
|
||||
|
||||
1) void flush_tlb_all(void)
|
||||
1) ``void flush_tlb_all(void)``
|
||||
|
||||
The most severe flush of all. After this interface runs,
|
||||
any previous page table modification whatsoever will be
|
||||
@ -37,7 +38,7 @@ changes occur:
|
||||
This is usually invoked when the kernel page tables are
|
||||
changed, since such translations are "global" in nature.
|
||||
|
||||
2) void flush_tlb_mm(struct mm_struct *mm)
|
||||
2) ``void flush_tlb_mm(struct mm_struct *mm)``
|
||||
|
||||
This interface flushes an entire user address space from
|
||||
the TLB. After running, this interface must make sure that
|
||||
@ -49,8 +50,8 @@ changes occur:
|
||||
page table operations such as what happens during
|
||||
fork, and exec.
|
||||
|
||||
3) void flush_tlb_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
3) ``void flush_tlb_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)``
|
||||
|
||||
Here we are flushing a specific range of (user) virtual
|
||||
address translations from the TLB. After running, this
|
||||
@ -69,7 +70,7 @@ changes occur:
|
||||
call flush_tlb_page (see below) for each entry which may be
|
||||
modified.
|
||||
|
||||
4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
|
||||
4) ``void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)``
|
||||
|
||||
This time we need to remove the PAGE_SIZE sized translation
|
||||
from the TLB. The 'vma' is the backing structure used by
|
||||
@ -87,8 +88,8 @@ changes occur:
|
||||
|
||||
This is used primarily during fault processing.
|
||||
|
||||
5) void update_mmu_cache(struct vm_area_struct *vma,
|
||||
unsigned long address, pte_t *ptep)
|
||||
5) ``void update_mmu_cache(struct vm_area_struct *vma,
|
||||
unsigned long address, pte_t *ptep)``
|
||||
|
||||
At the end of every page fault, this routine is invoked to
|
||||
tell the architecture specific code that a translation
|
||||
@ -100,7 +101,7 @@ changes occur:
|
||||
translations for software managed TLB configurations.
|
||||
The sparc64 port currently does this.
|
||||
|
||||
6) void tlb_migrate_finish(struct mm_struct *mm)
|
||||
6) ``void tlb_migrate_finish(struct mm_struct *mm)``
|
||||
|
||||
This interface is called at the end of an explicit
|
||||
process migration. This interface provides a hook
|
||||
@ -112,7 +113,7 @@ changes occur:
|
||||
|
||||
Next, we have the cache flushing interfaces. In general, when Linux
|
||||
is changing an existing virtual-->physical mapping to a new value,
|
||||
the sequence will be in one of the following forms:
|
||||
the sequence will be in one of the following forms::
|
||||
|
||||
1) flush_cache_mm(mm);
|
||||
change_all_page_tables_of(mm);
|
||||
@ -143,7 +144,7 @@ and have no dependency on translation information.
|
||||
|
||||
Here are the routines, one by one:
|
||||
|
||||
1) void flush_cache_mm(struct mm_struct *mm)
|
||||
1) ``void flush_cache_mm(struct mm_struct *mm)``
|
||||
|
||||
This interface flushes an entire user address space from
|
||||
the caches. That is, after running, there will be no cache
|
||||
@ -152,7 +153,7 @@ Here are the routines, one by one:
|
||||
This interface is used to handle whole address space
|
||||
page table operations such as what happens during exit and exec.
|
||||
|
||||
2) void flush_cache_dup_mm(struct mm_struct *mm)
|
||||
2) ``void flush_cache_dup_mm(struct mm_struct *mm)``
|
||||
|
||||
This interface flushes an entire user address space from
|
||||
the caches. That is, after running, there will be no cache
|
||||
@ -164,8 +165,8 @@ Here are the routines, one by one:
|
||||
This option is separate from flush_cache_mm to allow some
|
||||
optimizations for VIPT caches.
|
||||
|
||||
3) void flush_cache_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
3) ``void flush_cache_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)``
|
||||
|
||||
Here we are flushing a specific range of (user) virtual
|
||||
addresses from the cache. After running, there will be no
|
||||
@ -181,7 +182,7 @@ Here are the routines, one by one:
|
||||
call flush_cache_page (see below) for each entry which may be
|
||||
modified.
|
||||
|
||||
4) void flush_cache_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn)
|
||||
4) ``void flush_cache_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn)``
|
||||
|
||||
This time we need to remove a PAGE_SIZE sized range
|
||||
from the cache. The 'vma' is the backing structure used by
|
||||
@ -202,7 +203,7 @@ Here are the routines, one by one:
|
||||
|
||||
This is used primarily during fault processing.
|
||||
|
||||
5) void flush_cache_kmaps(void)
|
||||
5) ``void flush_cache_kmaps(void)``
|
||||
|
||||
This routine need only be implemented if the platform utilizes
|
||||
highmem. It will be called right before all of the kmaps
|
||||
@ -214,8 +215,8 @@ Here are the routines, one by one:
|
||||
|
||||
This routing should be implemented in asm/highmem.h
|
||||
|
||||
6) void flush_cache_vmap(unsigned long start, unsigned long end)
|
||||
void flush_cache_vunmap(unsigned long start, unsigned long end)
|
||||
6) ``void flush_cache_vmap(unsigned long start, unsigned long end)``
|
||||
``void flush_cache_vunmap(unsigned long start, unsigned long end)``
|
||||
|
||||
Here in these two interfaces we are flushing a specific range
|
||||
of (kernel) virtual addresses from the cache. After running,
|
||||
@ -243,8 +244,10 @@ size). This setting will force the SYSv IPC layer to only allow user
|
||||
processes to mmap shared memory at address which are a multiple of
|
||||
this value.
|
||||
|
||||
NOTE: This does not fix shared mmaps, check out the sparc64 port for
|
||||
one way to solve this (in particular SPARC_FLAG_MMAPSHARED).
|
||||
.. note::
|
||||
|
||||
This does not fix shared mmaps, check out the sparc64 port for
|
||||
one way to solve this (in particular SPARC_FLAG_MMAPSHARED).
|
||||
|
||||
Next, you have to solve the D-cache aliasing issue for all
|
||||
other cases. Please keep in mind that fact that, for a given page
|
||||
@ -255,8 +258,8 @@ physical page into its address space, by implication the D-cache
|
||||
aliasing problem has the potential to exist since the kernel already
|
||||
maps this page at its virtual address.
|
||||
|
||||
void copy_user_page(void *to, void *from, unsigned long addr, struct page *page)
|
||||
void clear_user_page(void *to, unsigned long addr, struct page *page)
|
||||
``void copy_user_page(void *to, void *from, unsigned long addr, struct page *page)``
|
||||
``void clear_user_page(void *to, unsigned long addr, struct page *page)``
|
||||
|
||||
These two routines store data in user anonymous or COW
|
||||
pages. It allows a port to efficiently avoid D-cache alias
|
||||
@ -276,14 +279,16 @@ maps this page at its virtual address.
|
||||
If D-cache aliasing is not an issue, these two routines may
|
||||
simply call memcpy/memset directly and do nothing more.
|
||||
|
||||
void flush_dcache_page(struct page *page)
|
||||
``void flush_dcache_page(struct page *page)``
|
||||
|
||||
Any time the kernel writes to a page cache page, _OR_
|
||||
the kernel is about to read from a page cache page and
|
||||
user space shared/writable mappings of this page potentially
|
||||
exist, this routine is called.
|
||||
|
||||
NOTE: This routine need only be called for page cache pages
|
||||
.. note::
|
||||
|
||||
This routine need only be called for page cache pages
|
||||
which can potentially ever be mapped into the address
|
||||
space of a user process. So for example, VFS layer code
|
||||
handling vfs symlinks in the page cache need not call
|
||||
@ -322,18 +327,19 @@ maps this page at its virtual address.
|
||||
made of this flag bit, and if set the flush is done and the flag
|
||||
bit is cleared.
|
||||
|
||||
IMPORTANT NOTE: It is often important, if you defer the flush,
|
||||
.. important::
|
||||
|
||||
It is often important, if you defer the flush,
|
||||
that the actual flush occurs on the same CPU
|
||||
as did the cpu stores into the page to make it
|
||||
dirty. Again, see sparc64 for examples of how
|
||||
to deal with this.
|
||||
|
||||
void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long user_vaddr,
|
||||
void *dst, void *src, int len)
|
||||
void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long user_vaddr,
|
||||
void *dst, void *src, int len)
|
||||
``void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long user_vaddr, void *dst, void *src, int len)``
|
||||
``void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long user_vaddr, void *dst, void *src, int len)``
|
||||
|
||||
When the kernel needs to copy arbitrary data in and out
|
||||
of arbitrary user pages (f.e. for ptrace()) it will use
|
||||
these two routines.
|
||||
@ -344,8 +350,9 @@ maps this page at its virtual address.
|
||||
likely that you will need to flush the instruction cache
|
||||
for copy_to_user_page().
|
||||
|
||||
void flush_anon_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long vmaddr)
|
||||
``void flush_anon_page(struct vm_area_struct *vma, struct page *page,
|
||||
unsigned long vmaddr)``
|
||||
|
||||
When the kernel needs to access the contents of an anonymous
|
||||
page, it calls this function (currently only
|
||||
get_user_pages()). Note: flush_dcache_page() deliberately
|
||||
@ -354,7 +361,8 @@ maps this page at its virtual address.
|
||||
architectures). For incoherent architectures, it should flush
|
||||
the cache of the page at vmaddr.
|
||||
|
||||
void flush_kernel_dcache_page(struct page *page)
|
||||
``void flush_kernel_dcache_page(struct page *page)``
|
||||
|
||||
When the kernel needs to modify a user page is has obtained
|
||||
with kmap, it calls this function after all modifications are
|
||||
complete (but before kunmapping it) to bring the underlying
|
||||
@ -366,14 +374,16 @@ maps this page at its virtual address.
|
||||
the kernel cache for page (using page_address(page)).
|
||||
|
||||
|
||||
void flush_icache_range(unsigned long start, unsigned long end)
|
||||
``void flush_icache_range(unsigned long start, unsigned long end)``
|
||||
|
||||
When the kernel stores into addresses that it will execute
|
||||
out of (eg when loading modules), this function is called.
|
||||
|
||||
If the icache does not snoop stores then this routine will need
|
||||
to flush it.
|
||||
|
||||
void flush_icache_page(struct vm_area_struct *vma, struct page *page)
|
||||
``void flush_icache_page(struct vm_area_struct *vma, struct page *page)``
|
||||
|
||||
All the functionality of flush_icache_page can be implemented in
|
||||
flush_dcache_page and update_mmu_cache. In the future, the hope
|
||||
is to remove this interface completely.
|
||||
@ -387,7 +397,8 @@ the kernel trying to do I/O to vmap areas must manually manage
|
||||
coherency. It must do this by flushing the vmap range before doing
|
||||
I/O and invalidating it after the I/O returns.
|
||||
|
||||
void flush_kernel_vmap_range(void *vaddr, int size)
|
||||
``void flush_kernel_vmap_range(void *vaddr, int size)``
|
||||
|
||||
flushes the kernel cache for a given virtual address range in
|
||||
the vmap area. This is to make sure that any data the kernel
|
||||
modified in the vmap range is made visible to the physical
|
||||
@ -395,7 +406,8 @@ I/O and invalidating it after the I/O returns.
|
||||
Note that this API does *not* also flush the offset map alias
|
||||
of the area.
|
||||
|
||||
void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates
|
||||
``void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates``
|
||||
|
||||
the cache for a given virtual address range in the vmap area
|
||||
which prevents the processor from making the cache stale by
|
||||
speculatively reading data while the I/O was occurring to the
|
||||
|
@ -789,23 +789,46 @@ way to trigger. Applications should do whatever they can to help the
|
||||
system. It might be too late to consult with vmstat or any other
|
||||
statistics, so it's advisable to take an immediate action.
|
||||
|
||||
The events are propagated upward until the event is handled, i.e. the
|
||||
events are not pass-through. Here is what this means: for example you have
|
||||
three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
|
||||
and C, and suppose group C experiences some pressure. In this situation,
|
||||
only group C will receive the notification, i.e. groups A and B will not
|
||||
receive it. This is done to avoid excessive "broadcasting" of messages,
|
||||
which disturbs the system and which is especially bad if we are low on
|
||||
memory or thrashing. So, organize the cgroups wisely, or propagate the
|
||||
events manually (or, ask us to implement the pass-through events,
|
||||
explaining why would you need them.)
|
||||
By default, events are propagated upward until the event is handled, i.e. the
|
||||
events are not pass-through. For example, you have three cgroups: A->B->C. Now
|
||||
you set up an event listener on cgroups A, B and C, and suppose group C
|
||||
experiences some pressure. In this situation, only group C will receive the
|
||||
notification, i.e. groups A and B will not receive it. This is done to avoid
|
||||
excessive "broadcasting" of messages, which disturbs the system and which is
|
||||
especially bad if we are low on memory or thrashing. Group B, will receive
|
||||
notification only if there are no event listers for group C.
|
||||
|
||||
There are three optional modes that specify different propagation behavior:
|
||||
|
||||
- "default": this is the default behavior specified above. This mode is the
|
||||
same as omitting the optional mode parameter, preserved by backwards
|
||||
compatibility.
|
||||
|
||||
- "hierarchy": events always propagate up to the root, similar to the default
|
||||
behavior, except that propagation continues regardless of whether there are
|
||||
event listeners at each level, with the "hierarchy" mode. In the above
|
||||
example, groups A, B, and C will receive notification of memory pressure.
|
||||
|
||||
- "local": events are pass-through, i.e. they only receive notifications when
|
||||
memory pressure is experienced in the memcg for which the notification is
|
||||
registered. In the above example, group C will receive notification if
|
||||
registered for "local" notification and the group experiences memory
|
||||
pressure. However, group B will never receive notification, regardless if
|
||||
there is an event listener for group C or not, if group B is registered for
|
||||
local notification.
|
||||
|
||||
The level and event notification mode ("hierarchy" or "local", if necessary) are
|
||||
specified by a comma-delimited string, i.e. "low,hierarchy" specifies
|
||||
hierarchical, pass-through, notification for all ancestor memcgs. Notification
|
||||
that is the default, non pass-through behavior, does not specify a mode.
|
||||
"medium,local" specifies pass-through notification for the medium level.
|
||||
|
||||
The file memory.pressure_level is only used to setup an eventfd. To
|
||||
register a notification, an application must:
|
||||
|
||||
- create an eventfd using eventfd(2);
|
||||
- open memory.pressure_level;
|
||||
- write string like "<event_fd> <fd of memory.pressure_level> <level>"
|
||||
- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
|
||||
to cgroup.event_control.
|
||||
|
||||
Application will be notified through eventfd when memory pressure is at
|
||||
@ -821,7 +844,7 @@ Test:
|
||||
# cd /sys/fs/cgroup/memory/
|
||||
# mkdir foo
|
||||
# cd foo
|
||||
# cgroup_event_listener memory.pressure_level low &
|
||||
# cgroup_event_listener memory.pressure_level low,hierarchy &
|
||||
# echo 8000000 > memory.limit_in_bytes
|
||||
# echo 8000000 > memory.memsw.limit_in_bytes
|
||||
# echo $$ > tasks
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,9 +1,9 @@
|
||||
================
|
||||
CIRCULAR BUFFERS
|
||||
================
|
||||
================
|
||||
Circular Buffers
|
||||
================
|
||||
|
||||
By: David Howells <dhowells@redhat.com>
|
||||
Paul E. McKenney <paulmck@linux.vnet.ibm.com>
|
||||
:Author: David Howells <dhowells@redhat.com>
|
||||
:Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
|
||||
|
||||
|
||||
Linux provides a number of features that can be used to implement circular
|
||||
@ -20,7 +20,7 @@ producer and just one consumer. It is possible to handle multiple producers by
|
||||
serialising them, and to handle multiple consumers by serialising them.
|
||||
|
||||
|
||||
Contents:
|
||||
.. Contents:
|
||||
|
||||
(*) What is a circular buffer?
|
||||
|
||||
@ -31,8 +31,8 @@ Contents:
|
||||
- The consumer.
|
||||
|
||||
|
||||
==========================
|
||||
WHAT IS A CIRCULAR BUFFER?
|
||||
|
||||
What is a circular buffer?
|
||||
==========================
|
||||
|
||||
First of all, what is a circular buffer? A circular buffer is a buffer of
|
||||
@ -60,9 +60,7 @@ buffer, provided that neither index overtakes the other. The implementer must
|
||||
be careful, however, as a region more than one unit in size may wrap the end of
|
||||
the buffer and be broken into two segments.
|
||||
|
||||
|
||||
============================
|
||||
MEASURING POWER-OF-2 BUFFERS
|
||||
Measuring power-of-2 buffers
|
||||
============================
|
||||
|
||||
Calculation of the occupancy or the remaining capacity of an arbitrarily sized
|
||||
@ -71,13 +69,13 @@ modulus (divide) instruction. However, if the buffer is of a power-of-2 size,
|
||||
then a much quicker bitwise-AND instruction can be used instead.
|
||||
|
||||
Linux provides a set of macros for handling power-of-2 circular buffers. These
|
||||
can be made use of by:
|
||||
can be made use of by::
|
||||
|
||||
#include <linux/circ_buf.h>
|
||||
|
||||
The macros are:
|
||||
|
||||
(*) Measure the remaining capacity of a buffer:
|
||||
(#) Measure the remaining capacity of a buffer::
|
||||
|
||||
CIRC_SPACE(head_index, tail_index, buffer_size);
|
||||
|
||||
@ -85,7 +83,7 @@ The macros are:
|
||||
can be inserted.
|
||||
|
||||
|
||||
(*) Measure the maximum consecutive immediate space in a buffer:
|
||||
(#) Measure the maximum consecutive immediate space in a buffer::
|
||||
|
||||
CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
|
||||
|
||||
@ -94,14 +92,14 @@ The macros are:
|
||||
beginning of the buffer.
|
||||
|
||||
|
||||
(*) Measure the occupancy of a buffer:
|
||||
(#) Measure the occupancy of a buffer::
|
||||
|
||||
CIRC_CNT(head_index, tail_index, buffer_size);
|
||||
|
||||
This returns the number of items currently occupying a buffer[2].
|
||||
|
||||
|
||||
(*) Measure the non-wrapping occupancy of a buffer:
|
||||
(#) Measure the non-wrapping occupancy of a buffer::
|
||||
|
||||
CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
|
||||
|
||||
@ -112,7 +110,7 @@ The macros are:
|
||||
Each of these macros will nominally return a value between 0 and buffer_size-1,
|
||||
however:
|
||||
|
||||
[1] CIRC_SPACE*() are intended to be used in the producer. To the producer
|
||||
(1) CIRC_SPACE*() are intended to be used in the producer. To the producer
|
||||
they will return a lower bound as the producer controls the head index,
|
||||
but the consumer may still be depleting the buffer on another CPU and
|
||||
moving the tail index.
|
||||
@ -120,7 +118,7 @@ however:
|
||||
To the consumer it will show an upper bound as the producer may be busy
|
||||
depleting the space.
|
||||
|
||||
[2] CIRC_CNT*() are intended to be used in the consumer. To the consumer they
|
||||
(2) CIRC_CNT*() are intended to be used in the consumer. To the consumer they
|
||||
will return a lower bound as the consumer controls the tail index, but the
|
||||
producer may still be filling the buffer on another CPU and moving the
|
||||
head index.
|
||||
@ -128,14 +126,12 @@ however:
|
||||
To the producer it will show an upper bound as the consumer may be busy
|
||||
emptying the buffer.
|
||||
|
||||
[3] To a third party, the order in which the writes to the indices by the
|
||||
(3) To a third party, the order in which the writes to the indices by the
|
||||
producer and consumer become visible cannot be guaranteed as they are
|
||||
independent and may be made on different CPUs - so the result in such a
|
||||
situation will merely be a guess, and may even be negative.
|
||||
|
||||
|
||||
===========================================
|
||||
USING MEMORY BARRIERS WITH CIRCULAR BUFFERS
|
||||
Using memory barriers with circular buffers
|
||||
===========================================
|
||||
|
||||
By using memory barriers in conjunction with circular buffers, you can avoid
|
||||
@ -152,10 +148,10 @@ time, and only one thing should be emptying a buffer at any one time, but the
|
||||
two sides can operate simultaneously.
|
||||
|
||||
|
||||
THE PRODUCER
|
||||
The producer
|
||||
------------
|
||||
|
||||
The producer will look something like this:
|
||||
The producer will look something like this::
|
||||
|
||||
spin_lock(&producer_lock);
|
||||
|
||||
@ -193,10 +189,10 @@ ordering between the read of the index indicating that the consumer has
|
||||
vacated a given element and the write by the producer to that same element.
|
||||
|
||||
|
||||
THE CONSUMER
|
||||
The Consumer
|
||||
------------
|
||||
|
||||
The consumer will look something like this:
|
||||
The consumer will look something like this::
|
||||
|
||||
spin_lock(&consumer_lock);
|
||||
|
||||
@ -235,8 +231,7 @@ prevents the compiler from tearing the store, and enforces ordering
|
||||
against previous accesses.
|
||||
|
||||
|
||||
===============
|
||||
FURTHER READING
|
||||
Further reading
|
||||
===============
|
||||
|
||||
See also Documentation/memory-barriers.txt for a description of Linux's memory
|
||||
|
@ -1,12 +1,16 @@
|
||||
The Common Clk Framework
|
||||
Mike Turquette <mturquette@ti.com>
|
||||
========================
|
||||
The Common Clk Framework
|
||||
========================
|
||||
|
||||
:Author: Mike Turquette <mturquette@ti.com>
|
||||
|
||||
This document endeavours to explain the common clk framework details,
|
||||
and how to port a platform over to this framework. It is not yet a
|
||||
detailed explanation of the clock api in include/linux/clk.h, but
|
||||
perhaps someday it will include that information.
|
||||
|
||||
Part 1 - introduction and interface split
|
||||
Introduction and interface split
|
||||
================================
|
||||
|
||||
The common clk framework is an interface to control the clock nodes
|
||||
available on various devices today. This may come in the form of clock
|
||||
@ -35,10 +39,11 @@ is defined in struct clk_foo and pointed to within struct clk_core. This
|
||||
allows for easy navigation between the two discrete halves of the common
|
||||
clock interface.
|
||||
|
||||
Part 2 - common data structures and api
|
||||
Common data structures and api
|
||||
==============================
|
||||
|
||||
Below is the common struct clk_core definition from
|
||||
drivers/clk/clk.c, modified for brevity:
|
||||
drivers/clk/clk.c, modified for brevity::
|
||||
|
||||
struct clk_core {
|
||||
const char *name;
|
||||
@ -59,7 +64,7 @@ struct clk. That api is documented in include/linux/clk.h.
|
||||
|
||||
Platforms and devices utilizing the common struct clk_core use the struct
|
||||
clk_ops pointer in struct clk_core to perform the hardware-specific parts of
|
||||
the operations defined in clk-provider.h:
|
||||
the operations defined in clk-provider.h::
|
||||
|
||||
struct clk_ops {
|
||||
int (*prepare)(struct clk_hw *hw);
|
||||
@ -95,19 +100,20 @@ the operations defined in clk-provider.h:
|
||||
struct dentry *dentry);
|
||||
};
|
||||
|
||||
Part 3 - hardware clk implementations
|
||||
Hardware clk implementations
|
||||
============================
|
||||
|
||||
The strength of the common struct clk_core comes from its .ops and .hw pointers
|
||||
which abstract the details of struct clk from the hardware-specific bits, and
|
||||
vice versa. To illustrate consider the simple gateable clk implementation in
|
||||
drivers/clk/clk-gate.c:
|
||||
drivers/clk/clk-gate.c::
|
||||
|
||||
struct clk_gate {
|
||||
struct clk_hw hw;
|
||||
void __iomem *reg;
|
||||
u8 bit_idx;
|
||||
...
|
||||
};
|
||||
struct clk_gate {
|
||||
struct clk_hw hw;
|
||||
void __iomem *reg;
|
||||
u8 bit_idx;
|
||||
...
|
||||
};
|
||||
|
||||
struct clk_gate contains struct clk_hw hw as well as hardware-specific
|
||||
knowledge about which register and bit controls this clk's gating.
|
||||
@ -115,7 +121,7 @@ Nothing about clock topology or accounting, such as enable_count or
|
||||
notifier_count, is needed here. That is all handled by the common
|
||||
framework code and struct clk_core.
|
||||
|
||||
Let's walk through enabling this clk from driver code:
|
||||
Let's walk through enabling this clk from driver code::
|
||||
|
||||
struct clk *clk;
|
||||
clk = clk_get(NULL, "my_gateable_clk");
|
||||
@ -123,70 +129,71 @@ Let's walk through enabling this clk from driver code:
|
||||
clk_prepare(clk);
|
||||
clk_enable(clk);
|
||||
|
||||
The call graph for clk_enable is very simple:
|
||||
The call graph for clk_enable is very simple::
|
||||
|
||||
clk_enable(clk);
|
||||
clk->ops->enable(clk->hw);
|
||||
[resolves to...]
|
||||
clk_gate_enable(hw);
|
||||
[resolves struct clk gate with to_clk_gate(hw)]
|
||||
clk_gate_set_bit(gate);
|
||||
clk_enable(clk);
|
||||
clk->ops->enable(clk->hw);
|
||||
[resolves to...]
|
||||
clk_gate_enable(hw);
|
||||
[resolves struct clk gate with to_clk_gate(hw)]
|
||||
clk_gate_set_bit(gate);
|
||||
|
||||
And the definition of clk_gate_set_bit:
|
||||
And the definition of clk_gate_set_bit::
|
||||
|
||||
static void clk_gate_set_bit(struct clk_gate *gate)
|
||||
{
|
||||
u32 reg;
|
||||
static void clk_gate_set_bit(struct clk_gate *gate)
|
||||
{
|
||||
u32 reg;
|
||||
|
||||
reg = __raw_readl(gate->reg);
|
||||
reg |= BIT(gate->bit_idx);
|
||||
writel(reg, gate->reg);
|
||||
}
|
||||
reg = __raw_readl(gate->reg);
|
||||
reg |= BIT(gate->bit_idx);
|
||||
writel(reg, gate->reg);
|
||||
}
|
||||
|
||||
Note that to_clk_gate is defined as:
|
||||
Note that to_clk_gate is defined as::
|
||||
|
||||
#define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw)
|
||||
#define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw)
|
||||
|
||||
This pattern of abstraction is used for every clock hardware
|
||||
representation.
|
||||
|
||||
Part 4 - supporting your own clk hardware
|
||||
Supporting your own clk hardware
|
||||
================================
|
||||
|
||||
When implementing support for a new type of clock it is only necessary to
|
||||
include the following header:
|
||||
include the following header::
|
||||
|
||||
#include <linux/clk-provider.h>
|
||||
#include <linux/clk-provider.h>
|
||||
|
||||
To construct a clk hardware structure for your platform you must define
|
||||
the following:
|
||||
the following::
|
||||
|
||||
struct clk_foo {
|
||||
struct clk_hw hw;
|
||||
... hardware specific data goes here ...
|
||||
};
|
||||
struct clk_foo {
|
||||
struct clk_hw hw;
|
||||
... hardware specific data goes here ...
|
||||
};
|
||||
|
||||
To take advantage of your data you'll need to support valid operations
|
||||
for your clk:
|
||||
for your clk::
|
||||
|
||||
struct clk_ops clk_foo_ops {
|
||||
.enable = &clk_foo_enable;
|
||||
.disable = &clk_foo_disable;
|
||||
};
|
||||
struct clk_ops clk_foo_ops {
|
||||
.enable = &clk_foo_enable;
|
||||
.disable = &clk_foo_disable;
|
||||
};
|
||||
|
||||
Implement the above functions using container_of:
|
||||
Implement the above functions using container_of::
|
||||
|
||||
#define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw)
|
||||
#define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw)
|
||||
|
||||
int clk_foo_enable(struct clk_hw *hw)
|
||||
{
|
||||
struct clk_foo *foo;
|
||||
int clk_foo_enable(struct clk_hw *hw)
|
||||
{
|
||||
struct clk_foo *foo;
|
||||
|
||||
foo = to_clk_foo(hw);
|
||||
foo = to_clk_foo(hw);
|
||||
|
||||
... perform magic on foo ...
|
||||
... perform magic on foo ...
|
||||
|
||||
return 0;
|
||||
};
|
||||
return 0;
|
||||
};
|
||||
|
||||
Below is a matrix detailing which clk_ops are mandatory based upon the
|
||||
hardware capabilities of that clock. A cell marked as "y" means
|
||||
@ -194,41 +201,56 @@ mandatory, a cell marked as "n" implies that either including that
|
||||
callback is invalid or otherwise unnecessary. Empty cells are either
|
||||
optional or must be evaluated on a case-by-case basis.
|
||||
|
||||
clock hardware characteristics
|
||||
-----------------------------------------------------------
|
||||
| gate | change rate | single parent | multiplexer | root |
|
||||
|------|-------------|---------------|-------------|------|
|
||||
.prepare | | | | | |
|
||||
.unprepare | | | | | |
|
||||
| | | | | |
|
||||
.enable | y | | | | |
|
||||
.disable | y | | | | |
|
||||
.is_enabled | y | | | | |
|
||||
| | | | | |
|
||||
.recalc_rate | | y | | | |
|
||||
.round_rate | | y [1] | | | |
|
||||
.determine_rate | | y [1] | | | |
|
||||
.set_rate | | y | | | |
|
||||
| | | | | |
|
||||
.set_parent | | | n | y | n |
|
||||
.get_parent | | | n | y | n |
|
||||
| | | | | |
|
||||
.recalc_accuracy| | | | | |
|
||||
| | | | | |
|
||||
.init | | | | | |
|
||||
-----------------------------------------------------------
|
||||
[1] either one of round_rate or determine_rate is required.
|
||||
.. table:: clock hardware characteristics
|
||||
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
| | gate | change rate | single parent | multiplexer | root |
|
||||
+================+======+=============+===============+=============+======+
|
||||
|.prepare | | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.unprepare | | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.enable | y | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.disable | y | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.is_enabled | y | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.recalc_rate | | y | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.round_rate | | y [1]_ | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.determine_rate | | y [1]_ | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.set_rate | | y | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.set_parent | | | n | y | n |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.get_parent | | | n | y | n |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.recalc_accuracy| | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|.init | | | | | |
|
||||
+----------------+------+-------------+---------------+-------------+------+
|
||||
|
||||
.. [1] either one of round_rate or determine_rate is required.
|
||||
|
||||
Finally, register your clock at run-time with a hardware-specific
|
||||
registration function. This function simply populates struct clk_foo's
|
||||
data and then passes the common struct clk parameters to the framework
|
||||
with a call to:
|
||||
with a call to::
|
||||
|
||||
clk_register(...)
|
||||
clk_register(...)
|
||||
|
||||
See the basic clock types in drivers/clk/clk-*.c for examples.
|
||||
See the basic clock types in ``drivers/clk/clk-*.c`` for examples.
|
||||
|
||||
Part 5 - Disabling clock gating of unused clocks
|
||||
Disabling clock gating of unused clocks
|
||||
=======================================
|
||||
|
||||
Sometimes during development it can be useful to be able to bypass the
|
||||
default disabling of unused clocks. For example, if drivers aren't enabling
|
||||
@ -239,7 +261,8 @@ are sorted out.
|
||||
To bypass this disabling, include "clk_ignore_unused" in the bootargs to the
|
||||
kernel.
|
||||
|
||||
Part 6 - Locking
|
||||
Locking
|
||||
=======
|
||||
|
||||
The common clock framework uses two global locks, the prepare lock and the
|
||||
enable lock.
|
||||
|
@ -271,8 +271,7 @@ latex_elements = {
|
||||
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
'preamble': '''
|
||||
% Adjust margins
|
||||
\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}
|
||||
\\usepackage{ifthen}
|
||||
|
||||
% Allow generate some pages in landscape
|
||||
\\usepackage{lscape}
|
||||
@ -281,6 +280,7 @@ latex_elements = {
|
||||
\\definecolor{NoteColor}{RGB}{204,255,255}
|
||||
\\definecolor{WarningColor}{RGB}{255,204,204}
|
||||
\\definecolor{AttentionColor}{RGB}{255,255,204}
|
||||
\\definecolor{ImportantColor}{RGB}{192,255,204}
|
||||
\\definecolor{OtherColor}{RGB}{204,204,204}
|
||||
\\newlength{\\mynoticelength}
|
||||
\\makeatletter\\newenvironment{coloredbox}[1]{%
|
||||
@ -301,7 +301,12 @@ latex_elements = {
|
||||
\\ifthenelse%
|
||||
{\\equal{\\py@noticetype}{attention}}%
|
||||
{\\colorbox{AttentionColor}{\\usebox{\\@tempboxa}}}%
|
||||
{\\colorbox{OtherColor}{\\usebox{\\@tempboxa}}}%
|
||||
{%
|
||||
\\ifthenelse%
|
||||
{\\equal{\\py@noticetype}{important}}%
|
||||
{\\colorbox{ImportantColor}{\\usebox{\\@tempboxa}}}%
|
||||
{\\colorbox{OtherColor}{\\usebox{\\@tempboxa}}}%
|
||||
}%
|
||||
}%
|
||||
}%
|
||||
}\\makeatother
|
||||
@ -336,30 +341,51 @@ latex_elements = {
|
||||
if major == 1 and minor > 3:
|
||||
latex_elements['preamble'] += '\\renewcommand*{\\DUrole}[2]{ #2 }\n'
|
||||
|
||||
if major == 1 and minor <= 4:
|
||||
latex_elements['preamble'] += '\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}'
|
||||
elif major == 1 and (minor > 5 or (minor == 5 and patch >= 3)):
|
||||
latex_elements['sphinxsetup'] = 'hmargin=0.5in, vmargin=0.5in'
|
||||
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title,
|
||||
# author, documentclass [howto, manual, or own class]).
|
||||
# Sorted in alphabetical order
|
||||
latex_documents = [
|
||||
('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide',
|
||||
'The kernel development community', 'manual'),
|
||||
('admin-guide/index', 'linux-user.tex', 'Linux Kernel User Documentation',
|
||||
'The kernel development community', 'manual'),
|
||||
('core-api/index', 'core-api.tex', 'The kernel core API manual',
|
||||
'The kernel development community', 'manual'),
|
||||
('crypto/index', 'crypto-api.tex', 'Linux Kernel Crypto API manual',
|
||||
'The kernel development community', 'manual'),
|
||||
('dev-tools/index', 'dev-tools.tex', 'Development tools for the Kernel',
|
||||
'The kernel development community', 'manual'),
|
||||
('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide',
|
||||
'The kernel development community', 'manual'),
|
||||
('driver-api/index', 'driver-api.tex', 'The kernel driver API manual',
|
||||
'The kernel development community', 'manual'),
|
||||
('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
|
||||
'The kernel development community', 'manual'),
|
||||
('kernel-documentation', 'kernel-documentation.tex', 'The Linux Kernel Documentation',
|
||||
'The kernel development community', 'manual'),
|
||||
('process/index', 'development-process.tex', 'Linux Kernel Development Documentation',
|
||||
('filesystems/index', 'filesystems.tex', 'Linux Filesystems API',
|
||||
'The kernel development community', 'manual'),
|
||||
('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide',
|
||||
'The kernel development community', 'manual'),
|
||||
('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
|
||||
'The kernel development community', 'manual'),
|
||||
('kernel-hacking/index', 'kernel-hacking.tex', 'Unreliable Guide To Hacking The Linux Kernel',
|
||||
'The kernel development community', 'manual'),
|
||||
('media/index', 'media.tex', 'Linux Media Subsystem Documentation',
|
||||
'The kernel development community', 'manual'),
|
||||
('networking/index', 'networking.tex', 'Linux Networking Documentation',
|
||||
'The kernel development community', 'manual'),
|
||||
('process/index', 'development-process.tex', 'Linux Kernel Development Documentation',
|
||||
'The kernel development community', 'manual'),
|
||||
('security/index', 'security.tex', 'The kernel security subsystem manual',
|
||||
'The kernel development community', 'manual'),
|
||||
('sh/index', 'sh.tex', 'SuperH architecture implementation manual',
|
||||
'The kernel development community', 'manual'),
|
||||
('sound/index', 'sound.tex', 'Linux Sound Subsystem Documentation',
|
||||
'The kernel development community', 'manual'),
|
||||
('userspace-api/index', 'userspace-api.tex', 'The Linux kernel user-space API guide',
|
||||
'The kernel development community', 'manual'),
|
||||
]
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top of
|
||||
|
@ -10,7 +10,10 @@ properties:
|
||||
|
||||
1. Objects are opaque pointers. The implementation does not care where they
|
||||
point (if anywhere) or what they point to (if anything).
|
||||
.. note:: Pointers to objects _must_ be zero in the least significant bit.
|
||||
|
||||
.. note::
|
||||
|
||||
Pointers to objects _must_ be zero in the least significant bit.
|
||||
|
||||
2. Objects do not need to contain linkage blocks for use by the array. This
|
||||
permits an object to be located in multiple arrays simultaneously.
|
||||
|
@ -303,6 +303,11 @@ defined which accomplish this::
|
||||
void smp_mb__before_atomic(void);
|
||||
void smp_mb__after_atomic(void);
|
||||
|
||||
Preceding a non-value-returning read-modify-write atomic operation with
|
||||
smp_mb__before_atomic() and following it with smp_mb__after_atomic()
|
||||
provides the same full ordering that is provided by value-returning
|
||||
read-modify-write atomic operations.
|
||||
|
||||
For example, smp_mb__before_atomic() can be used like so::
|
||||
|
||||
obj->dead = 1;
|
||||
|
@ -19,6 +19,7 @@ Core utilities
|
||||
workqueue
|
||||
genericirq
|
||||
flexible-arrays
|
||||
librs
|
||||
|
||||
Interfaces for kernel debugging
|
||||
===============================
|
||||
|
@ -114,7 +114,7 @@ The Slab Cache
|
||||
User Space Memory Access
|
||||
------------------------
|
||||
|
||||
.. kernel-doc:: arch/x86/include/asm/uaccess_32.h
|
||||
.. kernel-doc:: arch/x86/include/asm/uaccess.h
|
||||
:internal:
|
||||
|
||||
.. kernel-doc:: arch/x86/lib/usercopy_32.c
|
||||
|
212
Documentation/core-api/librs.rst
Normal file
212
Documentation/core-api/librs.rst
Normal file
@ -0,0 +1,212 @@
|
||||
==========================================
|
||||
Reed-Solomon Library Programming Interface
|
||||
==========================================
|
||||
|
||||
:Author: Thomas Gleixner
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
The generic Reed-Solomon Library provides encoding, decoding and error
|
||||
correction functions.
|
||||
|
||||
Reed-Solomon codes are used in communication and storage applications to
|
||||
ensure data integrity.
|
||||
|
||||
This documentation is provided for developers who want to utilize the
|
||||
functions provided by the library.
|
||||
|
||||
Known Bugs And Assumptions
|
||||
==========================
|
||||
|
||||
None.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
This chapter provides examples of how to use the library.
|
||||
|
||||
Initializing
|
||||
------------
|
||||
|
||||
The init function init_rs returns a pointer to an rs decoder structure,
|
||||
which holds the necessary information for encoding, decoding and error
|
||||
correction with the given polynomial. It either uses an existing
|
||||
matching decoder or creates a new one. On creation all the lookup tables
|
||||
for fast en/decoding are created. The function may take a while, so make
|
||||
sure not to call it in critical code paths.
|
||||
|
||||
::
|
||||
|
||||
/* the Reed Solomon control structure */
|
||||
static struct rs_control *rs_decoder;
|
||||
|
||||
/* Symbolsize is 10 (bits)
|
||||
* Primitive polynomial is x^10+x^3+1
|
||||
* first consecutive root is 0
|
||||
* primitive element to generate roots = 1
|
||||
* generator polynomial degree (number of roots) = 6
|
||||
*/
|
||||
rs_decoder = init_rs (10, 0x409, 0, 1, 6);
|
||||
|
||||
|
||||
Encoding
|
||||
--------
|
||||
|
||||
The encoder calculates the Reed-Solomon code over the given data length
|
||||
and stores the result in the parity buffer. Note that the parity buffer
|
||||
must be initialized before calling the encoder.
|
||||
|
||||
The expanded data can be inverted on the fly by providing a non-zero
|
||||
inversion mask. The expanded data is XOR'ed with the mask. This is used
|
||||
e.g. for FLASH ECC, where the all 0xFF is inverted to an all 0x00. The
|
||||
Reed-Solomon code for all 0x00 is all 0x00. The code is inverted before
|
||||
storing to FLASH so it is 0xFF too. This prevents that reading from an
|
||||
erased FLASH results in ECC errors.
|
||||
|
||||
The databytes are expanded to the given symbol size on the fly. There is
|
||||
no support for encoding continuous bitstreams with a symbol size != 8 at
|
||||
the moment. If it is necessary it should be not a big deal to implement
|
||||
such functionality.
|
||||
|
||||
::
|
||||
|
||||
/* Parity buffer. Size = number of roots */
|
||||
uint16_t par[6];
|
||||
/* Initialize the parity buffer */
|
||||
memset(par, 0, sizeof(par));
|
||||
/* Encode 512 byte in data8. Store parity in buffer par */
|
||||
encode_rs8 (rs_decoder, data8, 512, par, 0);
|
||||
|
||||
|
||||
Decoding
|
||||
--------
|
||||
|
||||
The decoder calculates the syndrome over the given data length and the
|
||||
received parity symbols and corrects errors in the data.
|
||||
|
||||
If a syndrome is available from a hardware decoder then the syndrome
|
||||
calculation is skipped.
|
||||
|
||||
The correction of the data buffer can be suppressed by providing a
|
||||
correction pattern buffer and an error location buffer to the decoder.
|
||||
The decoder stores the calculated error location and the correction
|
||||
bitmask in the given buffers. This is useful for hardware decoders which
|
||||
use a weird bit ordering scheme.
|
||||
|
||||
The databytes are expanded to the given symbol size on the fly. There is
|
||||
no support for decoding continuous bitstreams with a symbolsize != 8 at
|
||||
the moment. If it is necessary it should be not a big deal to implement
|
||||
such functionality.
|
||||
|
||||
Decoding with syndrome calculation, direct data correction
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
::
|
||||
|
||||
/* Parity buffer. Size = number of roots */
|
||||
uint16_t par[6];
|
||||
uint8_t data[512];
|
||||
int numerr;
|
||||
/* Receive data */
|
||||
.....
|
||||
/* Receive parity */
|
||||
.....
|
||||
/* Decode 512 byte in data8.*/
|
||||
numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL);
|
||||
|
||||
|
||||
Decoding with syndrome given by hardware decoder, direct data correction
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
::
|
||||
|
||||
/* Parity buffer. Size = number of roots */
|
||||
uint16_t par[6], syn[6];
|
||||
uint8_t data[512];
|
||||
int numerr;
|
||||
/* Receive data */
|
||||
.....
|
||||
/* Receive parity */
|
||||
.....
|
||||
/* Get syndrome from hardware decoder */
|
||||
.....
|
||||
/* Decode 512 byte in data8.*/
|
||||
numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL);
|
||||
|
||||
|
||||
Decoding with syndrome given by hardware decoder, no direct data correction.
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Note: It's not necessary to give data and received parity to the
|
||||
decoder.
|
||||
|
||||
::
|
||||
|
||||
/* Parity buffer. Size = number of roots */
|
||||
uint16_t par[6], syn[6], corr[8];
|
||||
uint8_t data[512];
|
||||
int numerr, errpos[8];
|
||||
/* Receive data */
|
||||
.....
|
||||
/* Receive parity */
|
||||
.....
|
||||
/* Get syndrome from hardware decoder */
|
||||
.....
|
||||
/* Decode 512 byte in data8.*/
|
||||
numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr);
|
||||
for (i = 0; i < numerr; i++) {
|
||||
do_error_correction_in_your_buffer(errpos[i], corr[i]);
|
||||
}
|
||||
|
||||
|
||||
Cleanup
|
||||
-------
|
||||
|
||||
The function free_rs frees the allocated resources, if the caller is
|
||||
the last user of the decoder.
|
||||
|
||||
::
|
||||
|
||||
/* Release resources */
|
||||
free_rs(rs_decoder);
|
||||
|
||||
|
||||
Structures
|
||||
==========
|
||||
|
||||
This chapter contains the autogenerated documentation of the structures
|
||||
which are used in the Reed-Solomon Library and are relevant for a
|
||||
developer.
|
||||
|
||||
.. kernel-doc:: include/linux/rslib.h
|
||||
:internal:
|
||||
|
||||
Public Functions Provided
|
||||
=========================
|
||||
|
||||
This chapter contains the autogenerated documentation of the
|
||||
Reed-Solomon functions which are exported.
|
||||
|
||||
.. kernel-doc:: lib/reed_solomon/reed_solomon.c
|
||||
:export:
|
||||
|
||||
Credits
|
||||
=======
|
||||
|
||||
The library code for encoding and decoding was written by Phil Karn.
|
||||
|
||||
::
|
||||
|
||||
Copyright 2002, Phil Karn, KA9Q
|
||||
May be used under the terms of the GNU General Public License (GPL)
|
||||
|
||||
|
||||
The wrapper functions and interfaces are written by Thomas Gleixner.
|
||||
|
||||
Many users have provided bugfixes, improvements and helping hands for
|
||||
testing. Thanks a lot.
|
||||
|
||||
The following people have contributed to this document:
|
||||
|
||||
Thomas Gleixner\ tglx@linutronix.de
|
@ -1,281 +0,0 @@
|
||||
Intel P-State driver
|
||||
--------------------
|
||||
|
||||
This driver provides an interface to control the P-State selection for the
|
||||
SandyBridge+ Intel processors.
|
||||
|
||||
The following document explains P-States:
|
||||
http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||
As stated in the document, P-State doesn’t exactly mean a frequency. However, for
|
||||
the sake of the relationship with cpufreq, P-State and frequency are used
|
||||
interchangeably.
|
||||
|
||||
Understanding the cpufreq core governors and policies are important before
|
||||
discussing more details about the Intel P-State driver. Based on what callbacks
|
||||
a cpufreq driver provides to the cpufreq core, it can support two types of
|
||||
drivers:
|
||||
- with target_index() callback: In this mode, the drivers using cpufreq core
|
||||
simply provide the minimum and maximum frequency limits and an additional
|
||||
interface target_index() to set the current frequency. The cpufreq subsystem
|
||||
has a number of scaling governors ("performance", "powersave", "ondemand",
|
||||
etc.). Depending on which governor is in use, cpufreq core will call for
|
||||
transitions to a specific frequency using target_index() callback.
|
||||
- setpolicy() callback: In this mode, drivers do not provide target_index()
|
||||
callback, so cpufreq core can't request a transition to a specific frequency.
|
||||
The driver provides minimum and maximum frequency limits and callbacks to set a
|
||||
policy. The policy in cpufreq sysfs is referred to as the "scaling governor".
|
||||
The cpufreq core can request the driver to operate in any of the two policies:
|
||||
"performance" and "powersave". The driver decides which frequency to use based
|
||||
on the above policy selection considering minimum and maximum frequency limits.
|
||||
|
||||
The Intel P-State driver falls under the latter category, which implements the
|
||||
setpolicy() callback. This driver decides what P-State to use based on the
|
||||
requested policy from the cpufreq core. If the processor is capable of
|
||||
selecting its next P-State internally, then the driver will offload this
|
||||
responsibility to the processor (aka HWP: Hardware P-States). If not, the
|
||||
driver implements algorithms to select the next P-State.
|
||||
|
||||
Since these policies are implemented in the driver, they are not same as the
|
||||
cpufreq scaling governors implementation, even if they have the same name in
|
||||
the cpufreq sysfs (scaling_governors). For example the "performance" policy is
|
||||
similar to cpufreq’s "performance" governor, but "powersave" is completely
|
||||
different than the cpufreq "powersave" governor. The strategy here is similar
|
||||
to cpufreq "ondemand", where the requested P-State is related to the system load.
|
||||
|
||||
Sysfs Interface
|
||||
|
||||
In addition to the frequency-controlling interfaces provided by the cpufreq
|
||||
core, the driver provides its own sysfs files to control the P-State selection.
|
||||
These files have been added to /sys/devices/system/cpu/intel_pstate/.
|
||||
Any changes made to these files are applicable to all CPUs (even in a
|
||||
multi-package system, Refer to later section on placing "Per-CPU limits").
|
||||
|
||||
max_perf_pct: Limits the maximum P-State that will be requested by
|
||||
the driver. It states it as a percentage of the available performance. The
|
||||
available (P-State) performance may be reduced by the no_turbo
|
||||
setting described below.
|
||||
|
||||
min_perf_pct: Limits the minimum P-State that will be requested by
|
||||
the driver. It states it as a percentage of the max (non-turbo)
|
||||
performance level.
|
||||
|
||||
no_turbo: Limits the driver to selecting P-State below the turbo
|
||||
frequency range.
|
||||
|
||||
turbo_pct: Displays the percentage of the total performance that
|
||||
is supported by hardware that is in the turbo range. This number
|
||||
is independent of whether turbo has been disabled or not.
|
||||
|
||||
num_pstates: Displays the number of P-States that are supported
|
||||
by hardware. This number is independent of whether turbo has
|
||||
been disabled or not.
|
||||
|
||||
For example, if a system has these parameters:
|
||||
Max 1 core turbo ratio: 0x21 (Max 1 core ratio is the maximum P-State)
|
||||
Max non turbo ratio: 0x17
|
||||
Minimum ratio : 0x08 (Here the ratio is called max efficiency ratio)
|
||||
|
||||
Sysfs will show :
|
||||
max_perf_pct:100, which corresponds to 1 core ratio
|
||||
min_perf_pct:24, max_efficiency_ratio / max 1 Core ratio
|
||||
no_turbo:0, turbo is not disabled
|
||||
num_pstates:26 = (max 1 Core ratio - Max Efficiency Ratio + 1)
|
||||
turbo_pct:39 = (max 1 core ratio - max non turbo ratio) / num_pstates
|
||||
|
||||
Refer to "Intel® 64 and IA-32 Architectures Software Developer’s Manual
|
||||
Volume 3: System Programming Guide" to understand ratios.
|
||||
|
||||
There is one more sysfs attribute in /sys/devices/system/cpu/intel_pstate/
|
||||
that can be used for controlling the operation mode of the driver:
|
||||
|
||||
status: Three settings are possible:
|
||||
"off" - The driver is not in use at this time.
|
||||
"active" - The driver works as a P-state governor (default).
|
||||
"passive" - The driver works as a regular cpufreq one and collaborates
|
||||
with the generic cpufreq governors (it sets P-states as
|
||||
requested by those governors).
|
||||
The current setting is returned by reads from this attribute. Writing one
|
||||
of the above strings to it changes the operation mode as indicated by that
|
||||
string, if possible. If HW-managed P-states (HWP) are enabled, it is not
|
||||
possible to change the driver's operation mode and attempts to write to
|
||||
this attribute will fail.
|
||||
|
||||
cpufreq sysfs for Intel P-State
|
||||
|
||||
Since this driver registers with cpufreq, cpufreq sysfs is also presented.
|
||||
There are some important differences, which need to be considered.
|
||||
|
||||
scaling_cur_freq: This displays the real frequency which was used during
|
||||
the last sample period instead of what is requested. Some other cpufreq driver,
|
||||
like acpi-cpufreq, displays what is requested (Some changes are on the
|
||||
way to fix this for acpi-cpufreq driver). The same is true for frequencies
|
||||
displayed at /proc/cpuinfo.
|
||||
|
||||
scaling_governor: This displays current active policy. Since each CPU has a
|
||||
cpufreq sysfs, it is possible to set a scaling governor to each CPU. But this
|
||||
is not possible with Intel P-States, as there is one common policy for all
|
||||
CPUs. Here, the last requested policy will be applicable to all CPUs. It is
|
||||
suggested that one use the cpupower utility to change policy to all CPUs at the
|
||||
same time.
|
||||
|
||||
scaling_setspeed: This attribute can never be used with Intel P-State.
|
||||
|
||||
scaling_max_freq/scaling_min_freq: This interface can be used similarly to
|
||||
the max_perf_pct/min_perf_pct of Intel P-State sysfs. However since frequencies
|
||||
are converted to nearest possible P-State, this is prone to rounding errors.
|
||||
This method is not preferred to limit performance.
|
||||
|
||||
affected_cpus: Not used
|
||||
related_cpus: Not used
|
||||
|
||||
For contemporary Intel processors, the frequency is controlled by the
|
||||
processor itself and the P-State exposed to software is related to
|
||||
performance levels. The idea that frequency can be set to a single
|
||||
frequency is fictional for Intel Core processors. Even if the scaling
|
||||
driver selects a single P-State, the actual frequency the processor
|
||||
will run at is selected by the processor itself.
|
||||
|
||||
Per-CPU limits
|
||||
|
||||
The kernel command line option "intel_pstate=per_cpu_perf_limits" forces
|
||||
the intel_pstate driver to use per-CPU performance limits. When it is set,
|
||||
the sysfs control interface described above is subject to limitations.
|
||||
- The following controls are not available for both read and write
|
||||
/sys/devices/system/cpu/intel_pstate/max_perf_pct
|
||||
/sys/devices/system/cpu/intel_pstate/min_perf_pct
|
||||
- The following controls can be used to set performance limits, as far as the
|
||||
architecture of the processor permits:
|
||||
/sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
|
||||
/sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq
|
||||
/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
|
||||
- User can still observe turbo percent and number of P-States from
|
||||
/sys/devices/system/cpu/intel_pstate/turbo_pct
|
||||
/sys/devices/system/cpu/intel_pstate/num_pstates
|
||||
- User can read write system wide turbo status
|
||||
/sys/devices/system/cpu/no_turbo
|
||||
|
||||
Support of energy performance hints
|
||||
It is possible to provide hints to the HWP algorithms in the processor
|
||||
to be more performance centric to more energy centric. When the driver
|
||||
is using HWP, two additional cpufreq sysfs attributes are presented for
|
||||
each logical CPU.
|
||||
These attributes are:
|
||||
- energy_performance_available_preferences
|
||||
- energy_performance_preference
|
||||
|
||||
To get list of supported hints:
|
||||
$ cat energy_performance_available_preferences
|
||||
default performance balance_performance balance_power power
|
||||
|
||||
The current preference can be read or changed via cpufreq sysfs
|
||||
attribute "energy_performance_preference". Reading from this attribute
|
||||
will display current effective setting. User can write any of the valid
|
||||
preference string to this attribute. User can always restore to power-on
|
||||
default by writing "default".
|
||||
|
||||
Since threads can migrate to different CPUs, this is possible that the
|
||||
new CPU may have different energy performance preference than the previous
|
||||
one. To avoid such issues, either threads can be pinned to specific CPUs
|
||||
or set the same energy performance preference value to all CPUs.
|
||||
|
||||
Tuning Intel P-State driver
|
||||
|
||||
When the performance can be tuned using PID (Proportional Integral
|
||||
Derivative) controller, debugfs files are provided for adjusting performance.
|
||||
They are presented under:
|
||||
/sys/kernel/debug/pstate_snb/
|
||||
|
||||
The PID tunable parameters are:
|
||||
deadband
|
||||
d_gain_pct
|
||||
i_gain_pct
|
||||
p_gain_pct
|
||||
sample_rate_ms
|
||||
setpoint
|
||||
|
||||
To adjust these parameters, some understanding of driver implementation is
|
||||
necessary. There are some tweeks described here, but be very careful. Adjusting
|
||||
them requires expert level understanding of power and performance relationship.
|
||||
These limits are only useful when the "powersave" policy is active.
|
||||
|
||||
-To make the system more responsive to load changes, sample_rate_ms can
|
||||
be adjusted (current default is 10ms).
|
||||
-To make the system use higher performance, even if the load is lower, setpoint
|
||||
can be adjusted to a lower number. This will also lead to faster ramp up time
|
||||
to reach the maximum P-State.
|
||||
If there are no derivative and integral coefficients, The next P-State will be
|
||||
equal to:
|
||||
current P-State - ((setpoint - current cpu load) * p_gain_pct)
|
||||
|
||||
For example, if the current PID parameters are (Which are defaults for the core
|
||||
processors like SandyBridge):
|
||||
deadband = 0
|
||||
d_gain_pct = 0
|
||||
i_gain_pct = 0
|
||||
p_gain_pct = 20
|
||||
sample_rate_ms = 10
|
||||
setpoint = 97
|
||||
|
||||
If the current P-State = 0x08 and current load = 100, this will result in the
|
||||
next P-State = 0x08 - ((97 - 100) * 0.2) = 8.6 (rounded to 9). Here the P-State
|
||||
goes up by only 1. If during next sample interval the current load doesn't
|
||||
change and still 100, then P-State goes up by one again. This process will
|
||||
continue as long as the load is more than the setpoint until the maximum P-State
|
||||
is reached.
|
||||
|
||||
For the same load at setpoint = 60, this will result in the next P-State
|
||||
= 0x08 - ((60 - 100) * 0.2) = 16
|
||||
So by changing the setpoint from 97 to 60, there is an increase of the
|
||||
next P-State from 9 to 16. So this will make processor execute at higher
|
||||
P-State for the same CPU load. If the load continues to be more than the
|
||||
setpoint during next sample intervals, then P-State will go up again till the
|
||||
maximum P-State is reached. But the ramp up time to reach the maximum P-State
|
||||
will be much faster when the setpoint is 60 compared to 97.
|
||||
|
||||
Debugging Intel P-State driver
|
||||
|
||||
Event tracing
|
||||
To debug P-State transition, the Linux event tracing interface can be used.
|
||||
There are two specific events, which can be enabled (Provided the kernel
|
||||
configs related to event tracing are enabled).
|
||||
|
||||
# cd /sys/kernel/debug/tracing/
|
||||
# echo 1 > events/power/pstate_sample/enable
|
||||
# echo 1 > events/power/cpu_frequency/enable
|
||||
# cat trace
|
||||
gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107
|
||||
scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618
|
||||
freq=2474476
|
||||
cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2
|
||||
|
||||
|
||||
Using ftrace
|
||||
|
||||
If function level tracing is required, the Linux ftrace interface can be used.
|
||||
For example if we want to check how often a function to set a P-State is
|
||||
called, we can set ftrace filter to intel_pstate_set_pstate.
|
||||
|
||||
# cd /sys/kernel/debug/tracing/
|
||||
# cat available_filter_functions | grep -i pstate
|
||||
intel_pstate_set_pstate
|
||||
intel_pstate_cpu_init
|
||||
...
|
||||
|
||||
# echo intel_pstate_set_pstate > set_ftrace_filter
|
||||
# echo function > current_tracer
|
||||
# cat trace | head -15
|
||||
# tracer: function
|
||||
#
|
||||
# entries-in-buffer/entries-written: 80/80 #P:4
|
||||
#
|
||||
# _-----=> irqs-off
|
||||
# / _----=> need-resched
|
||||
# | / _---=> hardirq/softirq
|
||||
# || / _--=> preempt-depth
|
||||
# ||| / delay
|
||||
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
|
||||
# | | | |||| | |
|
||||
Xorg-3129 [000] ..s. 2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||
gnome-terminal--4510 [002] ..s. 2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||
gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
|
@ -1,9 +1,10 @@
|
||||
========
|
||||
CPU load
|
||||
--------
|
||||
========
|
||||
|
||||
Linux exports various bits of information via `/proc/stat' and
|
||||
`/proc/uptime' that userland tools, such as top(1), use to calculate
|
||||
the average time system spent in a particular state, for example:
|
||||
Linux exports various bits of information via ``/proc/stat`` and
|
||||
``/proc/uptime`` that userland tools, such as top(1), use to calculate
|
||||
the average time system spent in a particular state, for example::
|
||||
|
||||
$ iostat
|
||||
Linux 2.6.18.3-exp (linmac) 02/20/2007
|
||||
@ -17,7 +18,7 @@ Here the system thinks that over the default sampling period the
|
||||
system spent 10.01% of the time doing work in user space, 2.92% in the
|
||||
kernel, and was overall 81.63% of the time idle.
|
||||
|
||||
In most cases the `/proc/stat' information reflects the reality quite
|
||||
In most cases the ``/proc/stat`` information reflects the reality quite
|
||||
closely, however due to the nature of how/when the kernel collects
|
||||
this data sometimes it can not be trusted at all.
|
||||
|
||||
@ -33,78 +34,78 @@ Example
|
||||
-------
|
||||
|
||||
If we imagine the system with one task that periodically burns cycles
|
||||
in the following manner:
|
||||
in the following manner::
|
||||
|
||||
time line between two timer interrupts
|
||||
|--------------------------------------|
|
||||
^ ^
|
||||
|_ something begins working |
|
||||
|_ something goes to sleep
|
||||
(only to be awaken quite soon)
|
||||
time line between two timer interrupts
|
||||
|--------------------------------------|
|
||||
^ ^
|
||||
|_ something begins working |
|
||||
|_ something goes to sleep
|
||||
(only to be awaken quite soon)
|
||||
|
||||
In the above situation the system will be 0% loaded according to the
|
||||
`/proc/stat' (since the timer interrupt will always happen when the
|
||||
``/proc/stat`` (since the timer interrupt will always happen when the
|
||||
system is executing the idle handler), but in reality the load is
|
||||
closer to 99%.
|
||||
|
||||
One can imagine many more situations where this behavior of the kernel
|
||||
will lead to quite erratic information inside `/proc/stat'.
|
||||
will lead to quite erratic information inside ``/proc/stat``::
|
||||
|
||||
|
||||
/* gcc -o hog smallhog.c */
|
||||
#include <time.h>
|
||||
#include <limits.h>
|
||||
#include <signal.h>
|
||||
#include <sys/time.h>
|
||||
#define HIST 10
|
||||
/* gcc -o hog smallhog.c */
|
||||
#include <time.h>
|
||||
#include <limits.h>
|
||||
#include <signal.h>
|
||||
#include <sys/time.h>
|
||||
#define HIST 10
|
||||
|
||||
static volatile sig_atomic_t stop;
|
||||
static volatile sig_atomic_t stop;
|
||||
|
||||
static void sighandler (int signr)
|
||||
{
|
||||
(void) signr;
|
||||
stop = 1;
|
||||
}
|
||||
static unsigned long hog (unsigned long niters)
|
||||
{
|
||||
stop = 0;
|
||||
while (!stop && --niters);
|
||||
return niters;
|
||||
}
|
||||
int main (void)
|
||||
{
|
||||
int i;
|
||||
struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
|
||||
.it_value = { .tv_sec = 0, .tv_usec = 1 } };
|
||||
sigset_t set;
|
||||
unsigned long v[HIST];
|
||||
double tmp = 0.0;
|
||||
unsigned long n;
|
||||
signal (SIGALRM, &sighandler);
|
||||
setitimer (ITIMER_REAL, &it, NULL);
|
||||
static void sighandler (int signr)
|
||||
{
|
||||
(void) signr;
|
||||
stop = 1;
|
||||
}
|
||||
static unsigned long hog (unsigned long niters)
|
||||
{
|
||||
stop = 0;
|
||||
while (!stop && --niters);
|
||||
return niters;
|
||||
}
|
||||
int main (void)
|
||||
{
|
||||
int i;
|
||||
struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
|
||||
.it_value = { .tv_sec = 0, .tv_usec = 1 } };
|
||||
sigset_t set;
|
||||
unsigned long v[HIST];
|
||||
double tmp = 0.0;
|
||||
unsigned long n;
|
||||
signal (SIGALRM, &sighandler);
|
||||
setitimer (ITIMER_REAL, &it, NULL);
|
||||
|
||||
hog (ULONG_MAX);
|
||||
for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
|
||||
for (i = 0; i < HIST; ++i) tmp += v[i];
|
||||
tmp /= HIST;
|
||||
n = tmp - (tmp / 3.0);
|
||||
hog (ULONG_MAX);
|
||||
for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
|
||||
for (i = 0; i < HIST; ++i) tmp += v[i];
|
||||
tmp /= HIST;
|
||||
n = tmp - (tmp / 3.0);
|
||||
|
||||
sigemptyset (&set);
|
||||
sigaddset (&set, SIGALRM);
|
||||
sigemptyset (&set);
|
||||
sigaddset (&set, SIGALRM);
|
||||
|
||||
for (;;) {
|
||||
hog (n);
|
||||
sigwait (&set, &i);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
for (;;) {
|
||||
hog (n);
|
||||
sigwait (&set, &i);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
http://lkml.org/lkml/2007/2/12/6
|
||||
Documentation/filesystems/proc.txt (1.8)
|
||||
- http://lkml.org/lkml/2007/2/12/6
|
||||
- Documentation/filesystems/proc.txt (1.8)
|
||||
|
||||
|
||||
Thanks
|
||||
|
@ -1,3 +1,6 @@
|
||||
===========================================
|
||||
How CPU topology info is exported via sysfs
|
||||
===========================================
|
||||
|
||||
Export CPU topology info via sysfs. Items (attributes) are similar
|
||||
to /proc/cpuinfo output of some architectures:
|
||||
@ -75,24 +78,26 @@ CONFIG_SCHED_BOOK and CONFIG_DRAWER are currently only used on s390, where
|
||||
they reflect the cpu and cache hierarchy.
|
||||
|
||||
For an architecture to support this feature, it must define some of
|
||||
these macros in include/asm-XXX/topology.h:
|
||||
#define topology_physical_package_id(cpu)
|
||||
#define topology_core_id(cpu)
|
||||
#define topology_book_id(cpu)
|
||||
#define topology_drawer_id(cpu)
|
||||
#define topology_sibling_cpumask(cpu)
|
||||
#define topology_core_cpumask(cpu)
|
||||
#define topology_book_cpumask(cpu)
|
||||
#define topology_drawer_cpumask(cpu)
|
||||
these macros in include/asm-XXX/topology.h::
|
||||
|
||||
The type of **_id macros is int.
|
||||
The type of **_cpumask macros is (const) struct cpumask *. The latter
|
||||
correspond with appropriate **_siblings sysfs attributes (except for
|
||||
#define topology_physical_package_id(cpu)
|
||||
#define topology_core_id(cpu)
|
||||
#define topology_book_id(cpu)
|
||||
#define topology_drawer_id(cpu)
|
||||
#define topology_sibling_cpumask(cpu)
|
||||
#define topology_core_cpumask(cpu)
|
||||
#define topology_book_cpumask(cpu)
|
||||
#define topology_drawer_cpumask(cpu)
|
||||
|
||||
The type of ``**_id macros`` is int.
|
||||
The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter
|
||||
correspond with appropriate ``**_siblings`` sysfs attributes (except for
|
||||
topology_sibling_cpumask() which corresponds with thread_siblings).
|
||||
|
||||
To be consistent on all architectures, include/linux/topology.h
|
||||
provides default definitions for any of the above macros that are
|
||||
not defined by include/asm-XXX/topology.h:
|
||||
|
||||
1) physical_package_id: -1
|
||||
2) core_id: 0
|
||||
3) sibling_cpumask: just the given CPU
|
||||
@ -107,6 +112,7 @@ Additionally, CPU topology information is provided under
|
||||
/sys/devices/system/cpu and includes these files. The internal
|
||||
source for the output is in brackets ("[]").
|
||||
|
||||
=========== ==========================================================
|
||||
kernel_max: the maximum CPU index allowed by the kernel configuration.
|
||||
[NR_CPUS-1]
|
||||
|
||||
@ -122,6 +128,7 @@ source for the output is in brackets ("[]").
|
||||
|
||||
present: CPUs that have been identified as being present in the
|
||||
system. [cpu_present_mask]
|
||||
=========== ==========================================================
|
||||
|
||||
The format for the above output is compatible with cpulist_parse()
|
||||
[see <linux/cpumask.h>]. Some examples follow.
|
||||
@ -129,7 +136,7 @@ The format for the above output is compatible with cpulist_parse()
|
||||
In this example, there are 64 CPUs in the system but cpus 32-63 exceed
|
||||
the kernel max which is limited to 0..31 by the NR_CPUS config option
|
||||
being 32. Note also that CPUs 2 and 4-31 are not online but could be
|
||||
brought online as they are both present and possible.
|
||||
brought online as they are both present and possible::
|
||||
|
||||
kernel_max: 31
|
||||
offline: 2,4-31,32-63
|
||||
@ -140,7 +147,7 @@ brought online as they are both present and possible.
|
||||
In this example, the NR_CPUS config option is 128, but the kernel was
|
||||
started with possible_cpus=144. There are 4 CPUs in the system and cpu2
|
||||
was manually taken offline (and is the only CPU that can be brought
|
||||
online.)
|
||||
online.)::
|
||||
|
||||
kernel_max: 127
|
||||
offline: 2,4-127,128-143
|
||||
|
@ -1,4 +1,6 @@
|
||||
A brief CRC tutorial.
|
||||
=================================
|
||||
brief tutorial on CRC computation
|
||||
=================================
|
||||
|
||||
A CRC is a long-division remainder. You add the CRC to the message,
|
||||
and the whole thing (message+CRC) is a multiple of the given
|
||||
@ -8,7 +10,8 @@ remainder computed on the message+CRC is 0. This latter approach
|
||||
is used by a lot of hardware implementations, and is why so many
|
||||
protocols put the end-of-frame flag after the CRC.
|
||||
|
||||
It's actually the same long division you learned in school, except that
|
||||
It's actually the same long division you learned in school, except that:
|
||||
|
||||
- We're working in binary, so the digits are only 0 and 1, and
|
||||
- When dividing polynomials, there are no carries. Rather than add and
|
||||
subtract, we just xor. Thus, we tend to get a bit sloppy about
|
||||
@ -40,11 +43,12 @@ throw the quotient bit away, but subtract the appropriate multiple of
|
||||
the polynomial from the remainder and we're back to where we started,
|
||||
ready to process the next bit.
|
||||
|
||||
A big-endian CRC written this way would be coded like:
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
multiple = remainder & 0x80000000 ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1 | next_input_bit()) ^ multiple;
|
||||
}
|
||||
A big-endian CRC written this way would be coded like::
|
||||
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
multiple = remainder & 0x80000000 ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1 | next_input_bit()) ^ multiple;
|
||||
}
|
||||
|
||||
Notice how, to get at bit 32 of the shifted remainder, we look
|
||||
at bit 31 of the remainder *before* shifting it.
|
||||
@ -54,25 +58,26 @@ the remainder don't actually affect any decision-making until
|
||||
32 bits later. Thus, the first 32 cycles of this are pretty boring.
|
||||
Also, to add the CRC to a message, we need a 32-bit-long hole for it at
|
||||
the end, so we have to add 32 extra cycles shifting in zeros at the
|
||||
end of every message,
|
||||
end of every message.
|
||||
|
||||
These details lead to a standard trick: rearrange merging in the
|
||||
next_input_bit() until the moment it's needed. Then the first 32 cycles
|
||||
can be precomputed, and merging in the final 32 zero bits to make room
|
||||
for the CRC can be skipped entirely. This changes the code to:
|
||||
for the CRC can be skipped entirely. This changes the code to::
|
||||
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
remainder ^= next_input_bit() << 31;
|
||||
multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1) ^ multiple;
|
||||
}
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
remainder ^= next_input_bit() << 31;
|
||||
multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1) ^ multiple;
|
||||
}
|
||||
|
||||
With this optimization, the little-endian code is particularly simple:
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
remainder ^= next_input_bit();
|
||||
multiple = (remainder & 1) ? CRCPOLY : 0;
|
||||
remainder = (remainder >> 1) ^ multiple;
|
||||
}
|
||||
With this optimization, the little-endian code is particularly simple::
|
||||
|
||||
for (i = 0; i < input_bits; i++) {
|
||||
remainder ^= next_input_bit();
|
||||
multiple = (remainder & 1) ? CRCPOLY : 0;
|
||||
remainder = (remainder >> 1) ^ multiple;
|
||||
}
|
||||
|
||||
The most significant coefficient of the remainder polynomial is stored
|
||||
in the least significant bit of the binary "remainder" variable.
|
||||
@ -81,23 +86,25 @@ be bit-reversed) and next_input_bit().
|
||||
|
||||
As long as next_input_bit is returning the bits in a sensible order, we don't
|
||||
*have* to wait until the last possible moment to merge in additional bits.
|
||||
We can do it 8 bits at a time rather than 1 bit at a time:
|
||||
for (i = 0; i < input_bytes; i++) {
|
||||
remainder ^= next_input_byte() << 24;
|
||||
for (j = 0; j < 8; j++) {
|
||||
multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1) ^ multiple;
|
||||
}
|
||||
}
|
||||
We can do it 8 bits at a time rather than 1 bit at a time::
|
||||
|
||||
Or in little-endian:
|
||||
for (i = 0; i < input_bytes; i++) {
|
||||
remainder ^= next_input_byte();
|
||||
for (j = 0; j < 8; j++) {
|
||||
multiple = (remainder & 1) ? CRCPOLY : 0;
|
||||
remainder = (remainder >> 1) ^ multiple;
|
||||
for (i = 0; i < input_bytes; i++) {
|
||||
remainder ^= next_input_byte() << 24;
|
||||
for (j = 0; j < 8; j++) {
|
||||
multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
|
||||
remainder = (remainder << 1) ^ multiple;
|
||||
}
|
||||
}
|
||||
|
||||
Or in little-endian::
|
||||
|
||||
for (i = 0; i < input_bytes; i++) {
|
||||
remainder ^= next_input_byte();
|
||||
for (j = 0; j < 8; j++) {
|
||||
multiple = (remainder & 1) ? CRCPOLY : 0;
|
||||
remainder = (remainder >> 1) ^ multiple;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
If the input is a multiple of 32 bits, you can even XOR in a 32-bit
|
||||
word at a time and increase the inner loop count to 32.
|
||||
|
@ -155,9 +155,9 @@ Code Example For Use of Operational State Memory With SHASH
|
||||
char ctx[];
|
||||
};
|
||||
|
||||
static struct sdesc init_sdesc(struct crypto_shash *alg)
|
||||
static struct sdesc *init_sdesc(struct crypto_shash *alg)
|
||||
{
|
||||
struct sdesc sdesc;
|
||||
struct sdesc *sdesc;
|
||||
int size;
|
||||
|
||||
size = sizeof(struct shash_desc) + crypto_shash_descsize(alg);
|
||||
@ -169,15 +169,16 @@ Code Example For Use of Operational State Memory With SHASH
|
||||
return sdesc;
|
||||
}
|
||||
|
||||
static int calc_hash(struct crypto_shashalg,
|
||||
const unsigned chardata, unsigned int datalen,
|
||||
unsigned chardigest) {
|
||||
struct sdesc sdesc;
|
||||
static int calc_hash(struct crypto_shash *alg,
|
||||
const unsigned char *data, unsigned int datalen,
|
||||
unsigned char *digest)
|
||||
{
|
||||
struct sdesc *sdesc;
|
||||
int ret;
|
||||
|
||||
sdesc = init_sdesc(alg);
|
||||
if (IS_ERR(sdesc)) {
|
||||
pr_info("trusted_key: can't alloc %s\n", hash_alg);
|
||||
pr_info("can't alloc sdesc\n");
|
||||
return PTR_ERR(sdesc);
|
||||
}
|
||||
|
||||
@ -186,6 +187,23 @@ Code Example For Use of Operational State Memory With SHASH
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int test_hash(const unsigned char *data, unsigned int datalen,
|
||||
unsigned char *digest)
|
||||
{
|
||||
struct crypto_shash *alg;
|
||||
char *hash_alg_name = "sha1-padlock-nano";
|
||||
int ret;
|
||||
|
||||
alg = crypto_alloc_shash(hash_alg_name, CRYPTO_ALG_TYPE_SHASH, 0);
|
||||
if (IS_ERR(alg)) {
|
||||
pr_info("can't alloc alg %s\n", hash_alg_name);
|
||||
return PTR_ERR(alg);
|
||||
}
|
||||
ret = calc_hash(alg, data, datalen, digest);
|
||||
crypto_free_shash(alg);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
Code Example For Random Number Generator Usage
|
||||
----------------------------------------------
|
||||
@ -195,8 +213,8 @@ Code Example For Random Number Generator Usage
|
||||
|
||||
static int get_random_numbers(u8 *buf, unsigned int len)
|
||||
{
|
||||
struct crypto_rngrng = NULL;
|
||||
chardrbg = "drbg_nopr_sha256"; /* Hash DRBG with SHA-256, no PR */
|
||||
struct crypto_rng *rng = NULL;
|
||||
char *drbg = "drbg_nopr_sha256"; /* Hash DRBG with SHA-256, no PR */
|
||||
int ret;
|
||||
|
||||
if (!buf || !len) {
|
||||
@ -207,7 +225,7 @@ Code Example For Random Number Generator Usage
|
||||
rng = crypto_alloc_rng(drbg, 0, 0);
|
||||
if (IS_ERR(rng)) {
|
||||
pr_debug("could not allocate RNG handle for %s\n", drbg);
|
||||
return -PTR_ERR(rng);
|
||||
return PTR_ERR(rng);
|
||||
}
|
||||
|
||||
ret = crypto_rng_get_bytes(rng, buf, len);
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user